aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt30
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.c2
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h119
-rw-r--r--arch/x86/include/asm/kvm_host.h54
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/svm.h36
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/cpuid.c10
-rw-r--r--arch/x86/kvm/emulate.c20
-rw-r--r--arch/x86/kvm/hyperv.c185
-rw-r--r--arch/x86/kvm/hyperv.h6
-rw-r--r--arch/x86/kvm/i8259.c8
-rw-r--r--arch/x86/kvm/ioapic.c6
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c14
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h14
-rw-r--r--arch/x86/kvm/lapic.c36
-rw-r--r--arch/x86/kvm/mmu.h38
-rw-r--r--arch/x86/kvm/mmu/mmu.c134
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c303
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h23
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h2
-rw-r--r--arch/x86/kvm/mmu/spte.c72
-rw-r--r--arch/x86/kvm/mmu/spte.h129
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c8
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h10
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c487
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h11
-rw-r--r--arch/x86/kvm/pmu.c7
-rw-r--r--arch/x86/kvm/svm/avic.c134
-rw-r--r--arch/x86/kvm/svm/hyperv.h35
-rw-r--r--arch/x86/kvm/svm/nested.c71
-rw-r--r--arch/x86/kvm/svm/sev.c85
-rw-r--r--arch/x86/kvm/svm/svm.c219
-rw-r--r--arch/x86/kvm/svm/svm.h69
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h25
-rw-r--r--arch/x86/kvm/trace.h14
-rw-r--r--arch/x86/kvm/vmx/nested.c5
-rw-r--r--arch/x86/kvm/vmx/nested.h3
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c3
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c6
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h4
-rw-r--r--arch/x86/kvm/vmx/vmx.c93
-rw-r--r--arch/x86/kvm/x86.c135
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/kvm/xen.c101
-rw-r--r--include/asm-generic/hyperv-tlfs.h7
-rw-r--r--kernel/static_call.c1
-rw-r--r--tools/testing/selftests/kvm/Makefile2
-rw-r--r--tools/testing/selftests/kvm/dirty_log_perf_test.c13
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/evmcs.h150
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/svm.h9
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/svm_util.h6
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/svm.c6
-rw-r--r--tools/testing/selftests/kvm/x86_64/evmcs_test.c64
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c29
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c175
-rw-r--r--tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c125
-rw-r--r--virt/kvm/kvm_main.c9
58 files changed, 2017 insertions, 1355 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f5a27f067db9..05161afd7642 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2339,13 +2339,35 @@
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
Default is 0 (don't ignore, but inject #GP)
+ kvm.eager_page_split=
+ [KVM,X86] Controls whether or not KVM will try to
+ proactively split all huge pages during dirty logging.
+ Eager page splitting reduces interruptions to vCPU
+ execution by eliminating the write-protection faults
+ and MMU lock contention that would otherwise be
+ required to split huge pages lazily.
+
+ VM workloads that rarely perform writes or that write
+ only to a small region of VM memory may benefit from
+ disabling eager page splitting to allow huge pages to
+ still be used for reads.
+
+ The behavior of eager page splitting depends on whether
+ KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
+ disabled, all huge pages in a memslot will be eagerly
+ split when dirty logging is enabled on that memslot. If
+ enabled, eager page splitting will be performed during
+ the KVM_CLEAR_DIRTY ioctl, and only for the pages being
+ cleared.
+
+ Eager page splitting currently only supports splitting
+ huge pages mapped by the TDP MMU.
+
+ Default is Y (on).
+
kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
Default is false (don't support).
- kvm.mmu_audit= [KVM] This is a R/W parameter which allows audit
- KVM MMU at runtime.
- Default is 0 (off)
-
kvm.nx_huge_pages=
[KVM] Controls the software workaround for the
X86_BUG_ITLB_MULTIHIT bug.
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index 7068da080799..49837d3a3ef5 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -248,6 +248,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
IRQCHIP_STATE_PENDING,
&val);
WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+ } else if (vgic_irq_is_mapped_level(irq)) {
+ val = vgic_get_phys_line_level(irq);
} else {
val = irq_is_pending(irq);
}
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index d39e0de06be2..29affccb353c 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -1,29 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL)
+#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_OPTIONAL)
BUILD_BUG_ON(1)
#endif
/*
- * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate
- * "static_call()"s. They are also intended for use when defining
- * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those
- * functions that follow the [svm|vmx]_func_name convention.
- * KVM_X86_OP_NULL() can leave a NULL definition for the
- * case where there is no definition or a function name that
- * doesn't match the typical naming convention is supplied.
+ * KVM_X86_OP() and KVM_X86_OP_OPTIONAL() are used to help generate
+ * both DECLARE/DEFINE_STATIC_CALL() invocations and
+ * "static_call_update()" calls.
+ *
+ * KVM_X86_OP_OPTIONAL() can be used for those functions that can have
+ * a NULL definition, for example if "static_call_cond()" will be used
+ * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
+ * to make a definition optional, but in this case the default will
+ * be __static_call_return0.
*/
-KVM_X86_OP_NULL(hardware_enable)
-KVM_X86_OP_NULL(hardware_disable)
-KVM_X86_OP_NULL(hardware_unsetup)
-KVM_X86_OP_NULL(cpu_has_accelerated_tpr)
+KVM_X86_OP(hardware_enable)
+KVM_X86_OP(hardware_disable)
+KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
KVM_X86_OP(vm_init)
-KVM_X86_OP_NULL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_destroy)
KVM_X86_OP(vcpu_create)
KVM_X86_OP(vcpu_free)
KVM_X86_OP(vcpu_reset)
-KVM_X86_OP(prepare_guest_switch)
+KVM_X86_OP(prepare_switch_to_guest)
KVM_X86_OP(vcpu_load)
KVM_X86_OP(vcpu_put)
KVM_X86_OP(update_exception_bitmap)
@@ -33,9 +34,9 @@ KVM_X86_OP(get_segment_base)
KVM_X86_OP(get_segment)
KVM_X86_OP(get_cpl)
KVM_X86_OP(set_segment)
-KVM_X86_OP_NULL(get_cs_db_l_bits)
+KVM_X86_OP(get_cs_db_l_bits)
KVM_X86_OP(set_cr0)
-KVM_X86_OP_NULL(post_set_cr3)
+KVM_X86_OP_OPTIONAL(post_set_cr3)
KVM_X86_OP(is_valid_cr4)
KVM_X86_OP(set_cr4)
KVM_X86_OP(set_efer)
@@ -49,22 +50,22 @@ KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
KVM_X86_OP(set_rflags)
KVM_X86_OP(get_if_flag)
-KVM_X86_OP(tlb_flush_all)
-KVM_X86_OP(tlb_flush_current)
-KVM_X86_OP_NULL(tlb_remote_flush)
-KVM_X86_OP_NULL(tlb_remote_flush_with_range)
-KVM_X86_OP(tlb_flush_gva)
-KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(flush_tlb_all)
+KVM_X86_OP(flush_tlb_current)
+KVM_X86_OP_OPTIONAL(tlb_remote_flush)
+KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range)
+KVM_X86_OP(flush_tlb_gva)
+KVM_X86_OP(flush_tlb_guest)
KVM_X86_OP(vcpu_pre_run)
-KVM_X86_OP(run)
-KVM_X86_OP_NULL(handle_exit)
-KVM_X86_OP_NULL(skip_emulated_instruction)
-KVM_X86_OP_NULL(update_emulated_instruction)
+KVM_X86_OP(vcpu_run)
+KVM_X86_OP(handle_exit)
+KVM_X86_OP(skip_emulated_instruction)
+KVM_X86_OP_OPTIONAL(update_emulated_instruction)
KVM_X86_OP(set_interrupt_shadow)
KVM_X86_OP(get_interrupt_shadow)
KVM_X86_OP(patch_hypercall)
-KVM_X86_OP(set_irq)
-KVM_X86_OP(set_nmi)
+KVM_X86_OP(inject_irq)
+KVM_X86_OP(inject_nmi)
KVM_X86_OP(queue_exception)
KVM_X86_OP(cancel_injection)
KVM_X86_OP(interrupt_allowed)
@@ -73,22 +74,22 @@ KVM_X86_OP(get_nmi_mask)
KVM_X86_OP(set_nmi_mask)
KVM_X86_OP(enable_nmi_window)
KVM_X86_OP(enable_irq_window)
-KVM_X86_OP(update_cr8_intercept)
+KVM_X86_OP_OPTIONAL(update_cr8_intercept)
KVM_X86_OP(check_apicv_inhibit_reasons)
KVM_X86_OP(refresh_apicv_exec_ctrl)
-KVM_X86_OP(hwapic_irr_update)
-KVM_X86_OP(hwapic_isr_update)
-KVM_X86_OP_NULL(guest_apic_has_interrupt)
-KVM_X86_OP(load_eoi_exitmap)
-KVM_X86_OP(set_virtual_apic_mode)
-KVM_X86_OP_NULL(set_apic_access_page_addr)
+KVM_X86_OP_OPTIONAL(hwapic_irr_update)
+KVM_X86_OP_OPTIONAL(hwapic_isr_update)
+KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt)
+KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
+KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
+KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
KVM_X86_OP(deliver_interrupt)
-KVM_X86_OP_NULL(sync_pir_to_irr)
-KVM_X86_OP(set_tss_addr)
-KVM_X86_OP(set_identity_map_addr)
-KVM_X86_OP(get_mt_mask)
+KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
+KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
+KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
+KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
-KVM_X86_OP_NULL(has_wbinvd_exit)
+KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
KVM_X86_OP(write_tsc_offset)
@@ -96,32 +97,36 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP_NULL(request_immediate_exit)
+KVM_X86_OP(request_immediate_exit)
KVM_X86_OP(sched_in)
-KVM_X86_OP_NULL(update_cpu_dirty_logging)
-KVM_X86_OP_NULL(vcpu_blocking)
-KVM_X86_OP_NULL(vcpu_unblocking)
-KVM_X86_OP_NULL(update_pi_irte)
-KVM_X86_OP_NULL(start_assignment)
-KVM_X86_OP_NULL(apicv_post_state_restore)
-KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt)
-KVM_X86_OP_NULL(set_hv_timer)
-KVM_X86_OP_NULL(cancel_hv_timer)
+KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
+KVM_X86_OP_OPTIONAL(vcpu_blocking)
+KVM_X86_OP_OPTIONAL(vcpu_unblocking)
+KVM_X86_OP_OPTIONAL(pi_update_irte)
+KVM_X86_OP_OPTIONAL(pi_start_assignment)
+KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
+KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_OPTIONAL(set_hv_timer)
+KVM_X86_OP_OPTIONAL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
KVM_X86_OP(smi_allowed)
KVM_X86_OP(enter_smm)
KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
-KVM_X86_OP_NULL(mem_enc_op)
-KVM_X86_OP_NULL(mem_enc_reg_region)
-KVM_X86_OP_NULL(mem_enc_unreg_region)
+KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(mem_enc_register_region)
+KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
+KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
+KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
KVM_X86_OP(get_msr_feature)
KVM_X86_OP(can_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
-KVM_X86_OP_NULL(enable_direct_tlbflush)
-KVM_X86_OP_NULL(migrate_timers)
+KVM_X86_OP_OPTIONAL(enable_direct_tlbflush)
+KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed)
-KVM_X86_OP_NULL(complete_emulated_msr)
+KVM_X86_OP(complete_emulated_msr)
+KVM_X86_OP(vcpu_deliver_sipi_vector)
#undef KVM_X86_OP
-#undef KVM_X86_OP_NULL
+#undef KVM_X86_OP_OPTIONAL
+#undef KVM_X86_OP_OPTIONAL_RET0
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6dcccb304775..884c926e4359 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1128,10 +1128,6 @@ struct kvm_arch {
struct kvm_hv hyperv;
struct kvm_xen xen;
- #ifdef CONFIG_KVM_MMU_AUDIT
- int audit_point;
- #endif
-
bool backwards_tsc_observed;
bool boot_vcpu_runs_old_kvmclock;
u32 bsp_vcpu_id;
@@ -1318,7 +1314,6 @@ struct kvm_x86_ops {
int (*hardware_enable)(void);
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
- bool (*cpu_has_accelerated_tpr)(void);
bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
@@ -1331,7 +1326,7 @@ struct kvm_x86_ops {
void (*vcpu_free)(struct kvm_vcpu *vcpu);
void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
- void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+ void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
@@ -1361,8 +1356,8 @@ struct kvm_x86_ops {
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
bool (*get_if_flag)(struct kvm_vcpu *vcpu);
- void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
- void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
+ void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
+ void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
int (*tlb_remote_flush)(struct kvm *kvm);
int (*tlb_remote_flush_with_range)(struct kvm *kvm,
struct kvm_tlb_range *range);
@@ -1373,16 +1368,16 @@ struct kvm_x86_ops {
* Can potentially get non-canonical addresses through INVLPGs, which
* the implementation may choose to ignore if appropriate.
*/
- void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
+ void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr);
/*
* Flush any TLB entries created by the guest. Like tlb_flush_gva(),
* does not need to flush GPA->HPA mappings.
*/
- void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
+ void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
- enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1391,8 +1386,8 @@ struct kvm_x86_ops {
u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
- void (*set_irq)(struct kvm_vcpu *vcpu);
- void (*set_nmi)(struct kvm_vcpu *vcpu);
+ void (*inject_irq)(struct kvm_vcpu *vcpu);
+ void (*inject_nmi)(struct kvm_vcpu *vcpu);
void (*queue_exception)(struct kvm_vcpu *vcpu);
void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
@@ -1459,9 +1454,9 @@ struct kvm_x86_ops {
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
- int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
+ int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
- void (*start_assignment)(struct kvm *kvm);
+ void (*pi_start_assignment)(struct kvm *kvm);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
@@ -1476,9 +1471,9 @@ struct kvm_x86_ops {
int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
- int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
- int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
- int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
+ int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
@@ -1541,15 +1536,22 @@ extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP(func) \
DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
-#define KVM_X86_OP_NULL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
static inline void kvm_ops_static_call_update(void)
{
-#define KVM_X86_OP(func) \
+#define __KVM_X86_OP(func) \
static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP_NULL KVM_X86_OP
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func ? : \
+ (void *) __static_call_return0);
#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
}
#define __KVM_HAVE_ARCH_VM_ALLOC
@@ -1587,6 +1589,13 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
const struct kvm_memory_slot *memslot,
int start_level);
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
@@ -1724,7 +1733,6 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -1878,7 +1886,7 @@ static inline bool kvm_is_supported_user_return_msr(u32 msr)
return kvm_find_user_return_msr(msr) >= 0;
}
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio);
+u64 kvm_scale_tsc(u64 tsc, u64 ratio);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3faf0f97edb1..a4a39c3e0f19 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -476,6 +476,7 @@
#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
#define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b
#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index b00dbc5fac2b..bb2fb78523ce 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -220,6 +220,42 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2)
+
+/* AVIC */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFF)
+
+#define AVIC_DOORBELL_PHYSICAL_ID_MASK (0xFF)
+
+#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
+#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+
+enum avic_ipi_failure_cause {
+ AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+ AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+ AVIC_IPI_FAILURE_INVALID_TARGET,
+ AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+};
+
+
+/*
+ * 0xff is broadcast, so the max index allowed for physical APIC ID
+ * table is 0xfe. APIC IDs above 0xff are reserved.
+ */
+#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff
+
+#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
+
struct vmcb_seg {
u16 selector;
u16 attrib;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2b1548da00eb..e3cbd7706136 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -126,13 +126,6 @@ config KVM_XEN
If in doubt, say "N".
-config KVM_MMU_AUDIT
- bool "Audit KVM MMU"
- depends on KVM && TRACEPOINTS
- help
- This option adds a R/W kVM module parameter 'mmu_audit', which allows
- auditing of KVM MMU events at runtime.
-
config KVM_EXTERNAL_WRITE_TRACKING
bool
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 494d4d351859..ff756cdc31ce 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -712,9 +712,17 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
entry = &array->entries[array->nent++];
+ memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
- entry->flags = 0;
+ switch (function & 0xC0000000) {
+ case 0x40000000:
+ /* Hypervisor leaves are always synthesized by __do_cpuid_func. */
+ return entry;
+
+ default:
+ break;
+ }
cpuid_count(entry->function, entry->index,
&entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5719d8cfdbd9..b7990defca9c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2600,8 +2600,7 @@ emulate_shutdown:
}
static void
-setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct desc_struct *cs, struct desc_struct *ss)
+setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
{
cs->l = 0; /* will be adjusted later */
set_desc_base(cs, 0); /* flat segment */
@@ -2690,7 +2689,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
if (!(efer & EFER_SCE))
return emulate_ud(ctxt);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
cs_sel = (u16)(msr_data & 0xfffc);
@@ -2758,7 +2757,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
ss_sel = cs_sel + 8;
@@ -2795,7 +2794,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
if ((ctxt->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
@@ -3013,8 +3012,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static int task_switch_16(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_16 tss_seg;
@@ -3152,8 +3150,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
return ret;
}
-static int task_switch_32(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_32 tss_seg;
@@ -3261,10 +3258,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
old_tss_sel = 0xffff;
if (next_tss_desc.type & 8)
- ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
+ ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc);
else
- ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
+ ret = task_switch_16(ctxt, old_tss_sel,
old_tss_base, &next_tss_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 6e38a7d22e97..653e08c993c4 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -112,6 +112,9 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
if (!!auto_eoi_old == !!auto_eoi_new)
return;
+ if (!enable_apicv)
+ return;
+
down_write(&vcpu->kvm->arch.apicv_update_lock);
if (auto_eoi_new)
@@ -1710,32 +1713,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
return kvm_hv_get_msr(vcpu, msr, pdata, host);
}
-static __always_inline unsigned long *sparse_set_to_vcpu_mask(
- struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
- u64 *vp_bitmap, unsigned long *vcpu_bitmap)
+static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
+ u64 valid_bank_mask, unsigned long *vcpu_mask)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
+ bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes);
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
struct kvm_vcpu *vcpu;
int bank, sbank = 0;
unsigned long i;
+ u64 *bitmap;
+
+ BUILD_BUG_ON(sizeof(vp_bitmap) >
+ sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS));
- memset(vp_bitmap, 0,
- KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
+ /*
+ * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else
+ * fill a temporary buffer and manually test each vCPU's VP index.
+ */
+ if (likely(!has_mismatch))
+ bitmap = (u64 *)vcpu_mask;
+ else
+ bitmap = vp_bitmap;
+
+ /*
+ * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask
+ * having a '1' for each bank that exists in sparse_banks. Sets must
+ * be in ascending order, i.e. bank0..bankN.
+ */
+ memset(bitmap, 0, sizeof(vp_bitmap));
for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
- vp_bitmap[bank] = sparse_banks[sbank++];
+ bitmap[bank] = sparse_banks[sbank++];
- if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
- /* for all vcpus vp_index == vcpu_idx */
- return (unsigned long *)vp_bitmap;
- }
+ if (likely(!has_mismatch))
+ return;
- bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
kvm_for_each_vcpu(i, vcpu, kvm) {
if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
- __set_bit(i, vcpu_bitmap);
+ __set_bit(i, vcpu_mask);
}
- return vcpu_bitmap;
}
struct kvm_hv_hcall {
@@ -1743,6 +1761,7 @@ struct kvm_hv_hcall {
u64 ingpa;
u64 outgpa;
u16 code;
+ u16 var_cnt;
u16 rep_cnt;
u16 rep_idx;
bool fast;
@@ -1750,21 +1769,40 @@ struct kvm_hv_hcall {
sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
};
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u64 *sparse_banks, gpa_t offset)
+{
+ u16 var_cnt;
+
+ if (hc->var_cnt > 64)
+ return -EINVAL;
+
+ /* Ignore banks that cannot possibly contain a legal VP index. */
+ var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS);
+
+ return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks,
+ var_cnt * sizeof(*sparse_banks));
+}
+
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex)
{
int i;
- gpa_t gpa;
struct kvm *kvm = vcpu->kvm;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
- u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
- unsigned long *vcpu_mask;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
u64 valid_bank_mask;
- u64 sparse_banks[64];
- int sparse_banks_len;
+ u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
bool all_cpus;
+ /*
+ * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
+ * valid mask is a u64. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) would exceed this limit, KVM will additional changes
+ * for Hyper-V support to avoid setting the guest up to fail.
+ */
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64);
+
if (!ex) {
if (hc->fast) {
flush.address_space = hc->ingpa;
@@ -1812,30 +1850,32 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = flush_ex.hv_vp_set.format !=
HV_GENERIC_SET_SPARSE_4K;
- sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64);
+ if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ goto do_flush;
- if (!sparse_banks_len && !all_cpus)
+ if (!hc->var_cnt)
goto ret_success;
- if (!all_cpus) {
- if (hc->fast) {
- if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1)
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
- for (i = 0; i < sparse_banks_len; i += 2) {
- sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]);
- sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]);
- }
- } else {
- gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex,
- hv_vp_set.bank_contents);
- if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks,
- sparse_banks_len *
- sizeof(sparse_banks[0]))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ if (hc->fast) {
+ if (hc->var_cnt > HV_HYPERCALL_MAX_XMM_REGISTERS - 1)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ for (i = 0; i < hc->var_cnt; i += 2) {
+ sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]);
+ sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]);
}
+ goto do_flush;
}
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks,
+ offsetof(struct hv_tlb_flush_ex,
+ hv_vp_set.bank_contents)))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+do_flush:
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
@@ -1843,11 +1883,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
if (all_cpus) {
kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
} else {
- vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
- vcpu_mask);
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask);
}
ret_success:
@@ -1880,12 +1918,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
- u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
- unsigned long *vcpu_mask;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
unsigned long valid_bank_mask;
- u64 sparse_banks[64];
- int sparse_banks_len;
+ u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
u32 vector;
bool all_cpus;
@@ -1918,22 +1953,20 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
vector = send_ipi_ex.vector;
valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
- sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
- sizeof(sparse_banks[0]);
-
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
if (all_cpus)
goto check_and_send_ipi;
- if (!sparse_banks_len)
+ if (!hc->var_cnt)
goto ret_success;
- if (kvm_read_guest(kvm,
- hc->ingpa + offsetof(struct hv_send_ipi_ex,
- vp_set.bank_contents),
- sparse_banks,
- sparse_banks_len))
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks,
+ offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents)))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
@@ -1941,11 +1974,13 @@ check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- vcpu_mask = all_cpus ? NULL :
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
+ if (all_cpus) {
+ kvm_send_ipi_to_many(kvm, vector, NULL);
+ } else {
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+ kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+ }
ret_success:
return HV_STATUS_SUCCESS;
@@ -2017,11 +2052,6 @@ int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
return ret;
}
-bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
-}
-
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
bool longmode;
@@ -2191,19 +2221,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
hc.code = hc.param & 0xffff;
+ hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET;
hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT);
hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
hc.rep = !!(hc.rep_cnt || hc.rep_idx);
- trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx,
- hc.ingpa, hc.outgpa);
+ trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt,
+ hc.rep_idx, hc.ingpa, hc.outgpa);
if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) {
ret = HV_STATUS_ACCESS_DENIED;
goto hypercall_complete;
}
+ if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ goto hypercall_complete;
+ }
+
if (hc.fast && is_xmm_fast_hypercall(&hc)) {
if (unlikely(hv_vcpu->enforce_cpuid &&
!(hv_vcpu->cpuid_cache.features_edx &
@@ -2217,14 +2253,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
switch (hc.code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
kvm_vcpu_on_spin(vcpu, true);
break;
case HVCALL_SIGNAL_EVENT:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -2234,7 +2270,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
fallthrough; /* maybe userspace knows this conn_id */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) {
+ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -2247,14 +2283,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
kvm_hv_hypercall_complete_userspace;
return 0;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
- if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
+ if (unlikely(!hc.rep_cnt || hc.rep_idx || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
ret = kvm_hv_flush_tlb(vcpu, &hc, false);
break;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -2275,7 +2311,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_hv_flush_tlb(vcpu, &hc, true);
break;
case HVCALL_SEND_IPI:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -2417,10 +2453,6 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
if (kvm_x86_ops.nested_ops->get_evmcs_version)
evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
- /* Skip NESTED_FEATURES if eVMCS is not supported */
- if (!evmcs_ver)
- --nent;
-
if (cpuid->nent < nent)
return -E2BIG;
@@ -2520,8 +2552,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
- if (evmcs_ver)
- ent->eax |= HV_X64_NESTED_MSR_BITMAP;
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index ed1c4e546d04..e19c00ee9ab3 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -89,7 +89,11 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
-bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu);
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 814064d06016..be99dc86293d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -437,13 +437,13 @@ static u32 pic_ioport_read(void *opaque, u32 addr)
return ret;
}
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+static void elcr_ioport_write(void *opaque, u32 val)
{
struct kvm_kpic_state *s = opaque;
s->elcr = val & s->elcr_mask;
}
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
+static u32 elcr_ioport_read(void *opaque)
{
struct kvm_kpic_state *s = opaque;
return s->elcr;
@@ -474,7 +474,7 @@ static int picdev_write(struct kvm_pic *s,
case 0x4d0:
case 0x4d1:
pic_lock(s);
- elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ elcr_ioport_write(&s->pics[addr & 1], data);
pic_unlock(s);
break;
default:
@@ -505,7 +505,7 @@ static int picdev_read(struct kvm_pic *s,
case 0x4d0:
case 0x4d1:
pic_lock(s);
- *data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ *data = elcr_ioport_read(&s->pics[addr & 1]);
pic_unlock(s);
break;
default:
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index decfa36b7891..765943d7cfa5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -54,9 +54,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
int trigger_mode,
int pin);
-static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
- unsigned long addr,
- unsigned long length)
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic)
{
unsigned long result = 0;
@@ -593,7 +591,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
break;
case IOAPIC_REG_WINDOW:
- result = ioapic_read_indirect(ioapic, addr, len);
+ result = ioapic_read_indirect(ioapic);
break;
default:
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index b469f45e3fe4..ee4f696a0782 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -92,3 +92,17 @@ int hv_remote_flush_tlb(struct kvm *kvm)
return hv_remote_flush_tlb_with_range(kvm, NULL);
}
EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
+
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+ struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
+
+ if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+ vcpu->arch.hv_root_tdp = root_tdp;
+ if (root_tdp != kvm_arch->hv_root_tdp)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_track_root_tdp);
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index 1c67abf2eba9..287e98ef9df3 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -10,19 +10,7 @@
int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range);
int hv_remote_flush_tlb(struct kvm *kvm);
-
-static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
-{
- struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
-
- if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
- spin_lock(&kvm_arch->hv_root_tdp_lock);
- vcpu->arch.hv_root_tdp = root_tdp;
- if (root_tdp != kvm_arch->hv_root_tdp)
- kvm_arch->hv_root_tdp = INVALID_PAGE;
- spin_unlock(&kvm_arch->hv_root_tdp_lock);
- }
-}
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d7e6fde82d25..47f8606559a9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}
bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
@@ -491,8 +492,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(vcpu->arch.apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- static_call(kvm_x86_hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -522,7 +522,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(vcpu->arch.apicv_active))
- static_call(kvm_x86_hwapic_isr_update)(vcpu, vec);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -570,8 +570,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(vcpu->arch.apicv_active))
- static_call(kvm_x86_hwapic_isr_update)(vcpu,
- apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -2287,7 +2286,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
- static_call(kvm_x86_set_virtual_apic_mode)(vcpu);
+ static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -2306,7 +2305,12 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
apic->irr_pending = true;
apic->isr_count = 1;
} else {
- apic->irr_pending = (apic_search_irr(apic) != -1);
+ /*
+ * Don't clear irr_pending, searching the IRR can race with
+ * updates from the CPU as APICv is still active from hardware's
+ * perspective. The flag will be cleared as appropriate when
+ * KVM injects the interrupt.
+ */
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
}
@@ -2368,9 +2372,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
- static_call(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call(kvm_x86_hwapic_isr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2633,11 +2637,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- static_call(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call(kvm_x86_hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
- static_call(kvm_x86_hwapic_isr_update)(vcpu,
- apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
@@ -2928,7 +2930,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+ static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e9fbb2c8bbe2..51faa2c76ca5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -203,44 +203,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
/*
- * Currently, we have two sorts of write-protection, a) the first one
- * write-protects guest page to sync the guest modification, b) another one is
- * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
- * between these two sorts are:
- * 1) the first case clears MMU-writable bit.
- * 2) the first case requires flushing tlb immediately avoiding corrupting
- * shadow page table between all vcpus so it should be in the protection of
- * mmu-lock. And the another case does not need to flush tlb until returning
- * the dirty bitmap to userspace since it only write-protects the page
- * logged in the bitmap, that means the page in the dirty bitmap is not
- * missed, so it can flush tlb out of mmu-lock.
- *
- * So, there is the problem: the first case can meet the corrupted tlb caused
- * by another case which write-protects pages but without flush tlb
- * immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose MMU-writable bit
- * is set, it works since another case never touches MMU-writable bit.
- *
- * Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with MMU-writable becomes
- * readonly, if that happens, we need to flush tlb. Fortunately,
- * mmu_spte_update() has already handled it perfectly.
- *
- * The rules to use MMU-writable and PT_WRITABLE_MASK:
- * - if we want to see if it has writable tlb entry or if the spte can be
- * writable on the mmu mapping, check MMU-writable, this is the most
- * case, otherwise
- * - if we fix page fault on the spte or do write-protection by dirty logging,
- * check PT_WRITABLE_MASK.
- *
- * TODO: introduce APIs to split these two cases.
- */
-static inline bool is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
* access rights (in ACC_* format).
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 593093b52395..0620480b99e0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -104,15 +104,6 @@ static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
-enum {
- AUDIT_PRE_PAGE_FAULT,
- AUDIT_POST_PAGE_FAULT,
- AUDIT_PRE_PTE_WRITE,
- AUDIT_POST_PTE_WRITE,
- AUDIT_PRE_SYNC,
- AUDIT_POST_SYNC
-};
-
#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
@@ -529,6 +520,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
u64 old_spte = *sptep;
WARN_ON(!is_shadow_present_pte(new_spte));
+ check_spte_writable_invariants(new_spte);
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
@@ -548,11 +540,9 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changed.
*
- * Whenever we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB, the return value indicates this
- * case.
+ * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
+ * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
+ * spte, even though the writable spte might be cached on a CPU's TLB.
*
* Returns true if the TLB needs to be flushed
*/
@@ -646,24 +636,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-/* Restore an acc-track PTE back to a regular PTE */
-static u64 restore_acc_track_spte(u64 spte)
-{
- u64 new_spte = spte;
- u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
- & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
-
- WARN_ON_ONCE(spte_ad_enabled(spte));
- WARN_ON_ONCE(!is_access_track_spte(spte));
-
- new_spte &= ~shadow_acc_track_mask;
- new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
- SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
- new_spte |= saved_bits;
-
- return new_spte;
-}
-
/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
@@ -1229,9 +1201,8 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_write_protect(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- bool pt_protect)
+static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1311,7 +1282,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
- __rmap_write_protect(kvm, rmap_head, false);
+ rmap_write_protect(rmap_head, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1378,6 +1349,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
/* Cross two large pages? */
@@ -1410,7 +1384,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
if (kvm_memslots_have_rmaps(kvm)) {
for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ write_protected |= rmap_write_protect(rmap_head, true);
}
}
@@ -1421,7 +1395,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
return write_protected;
}
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
{
struct kvm_memory_slot *slot;
@@ -1921,13 +1895,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
return true;
}
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
-static void mmu_audit_disable(void) { }
-#endif
-
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
if (sp->role.invalid)
@@ -2024,7 +1991,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
bool protected = false;
for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu, sp->gfn);
+ protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
if (protected) {
kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
@@ -2149,7 +2116,7 @@ trace_get_page:
hlist_add_head(&sp->hash_link, sp_list);
if (!direct) {
account_shadowed(vcpu->kvm, sp);
- if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
+ if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
}
trace_kvm_mmu_get_page(sp, true);
@@ -2307,7 +2274,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
return zapped;
}
-static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -2351,7 +2318,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
- kvm_mmu_unlink_parents(kvm, sp);
+ kvm_mmu_unlink_parents(sp);
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
@@ -3687,17 +3654,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
return;
write_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
-
mmu_sync_children(vcpu, sp, true);
-
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
return;
}
write_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
@@ -3709,7 +3671,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
}
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
}
@@ -4474,8 +4435,7 @@ static inline bool boot_cpu_is_amd(void)
* possible, however, kvm currently does not do execution-protection.
*/
static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -4506,8 +4466,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
* is the shadow page table for intel nested guest.
*/
static void
-reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
reserved_hpa_bits(), execonly,
@@ -4794,7 +4753,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
reset_guest_paging_metadata(vcpu, context);
- reset_tdp_shadow_zero_bits_mask(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(context);
}
static union kvm_mmu_role
@@ -4948,7 +4907,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
update_permission_bitmask(context, true);
context->pkru_mask = 0;
reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
- reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
+ reset_ept_shadow_zero_bits_mask(context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
@@ -5100,7 +5059,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_sync_roots(vcpu);
kvm_mmu_load_pgd(vcpu);
- static_call(kvm_x86_tlb_flush_current)(vcpu);
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
out:
return r;
}
@@ -5260,7 +5219,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
@@ -5285,7 +5243,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
- kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
write_unlock(&vcpu->kvm->mmu_lock);
}
@@ -5360,7 +5317,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(gva, vcpu))
return;
- static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
}
if (!mmu->invlpg)
@@ -5416,7 +5373,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
if (tlb_flush)
- static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
++vcpu->stat.invlpg;
@@ -5802,7 +5759,7 @@ static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot)
{
- return __rmap_write_protect(kvm, rmap_head, false);
+ return rmap_write_protect(rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -5846,12 +5803,52 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
* will clear a separate software-only bit (MMU-writable) and skip the
* flush if-and-only-if this bit was already clear.
*
- * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
+ * See is_writable_pte() for more details.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level)
+{
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+ target_level, false);
+
+ /*
+ * A TLB flush is unnecessary at this point for the same resons as in
+ * kvm_mmu_slot_try_split_huge_pages().
+ */
+}
+
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level)
+{
+ u64 start = memslot->base_gfn;
+ u64 end = start + memslot->npages;
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+ read_unlock(&kvm->mmu_lock);
+ }
+
+ /*
+ * No TLB flush is necessary here. KVM will flush TLBs after
+ * write-protecting and/or clearing dirty on the newly split SPTEs to
+ * ensure that guest writes are reflected in the dirty log before the
+ * ioctl to enable dirty logging on this memslot completes. Since the
+ * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+ * safe for KVM to decide if a TLB flush is necessary based on the split
+ * SPTEs.
+ */
+}
+
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot)
@@ -6191,7 +6188,6 @@ void kvm_mmu_module_exit(void)
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
unregister_shrinker(&mmu_shrinker);
- mmu_audit_disable();
}
/*
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
deleted file mode 100644
index 9e7dcf999f08..000000000000
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ /dev/null
@@ -1,303 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * mmu_audit.c:
- *
- * Audit code for KVM MMU
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <[email protected]>
- * Avi Kivity <[email protected]>
- * Marcelo Tosatti <[email protected]>
- * Xiao Guangrong <[email protected]>
- */
-
-#include <linux/ratelimit.h>
-
-static char const *audit_point_name[] = {
- "pre page fault",
- "post page fault",
- "pre pte write",
- "post pte write",
- "pre sync",
- "post sync"
-};
-
-#define audit_printk(kvm, fmt, args...) \
- printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[kvm->arch.audit_point], ##args)
-
-typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
-
-static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- inspect_spte_fn fn, int level)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 *ent = sp->spt;
-
- fn(vcpu, ent + i, level);
-
- if (is_shadow_present_pte(ent[i]) &&
- !is_last_spte(ent[i], level)) {
- struct kvm_mmu_page *child;
-
- child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(vcpu, child, fn, level - 1);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return;
-
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu->root_hpa;
-
- sp = to_shadow_page(root);
- __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level);
- return;
- }
-
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu->pae_root[i];
-
- if (IS_VALID_PAE_ROOT(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = to_shadow_page(root);
- __mmu_spte_walk(vcpu, sp, fn, 2);
- }
- }
-
- return;
-}
-
-typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
-
-static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
- fn(kvm, sp);
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- kvm_pfn_t pfn;
- hpa_t hpa;
-
- sp = sptep_to_sp(sptep);
-
- if (sp->unsync) {
- if (level != PG_LEVEL_4K) {
- audit_printk(vcpu->kvm, "unsync sp: %p "
- "level = %d\n", sp, level);
- return;
- }
- }
-
- if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
- return;
-
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
-
- if (is_error_pfn(pfn))
- return;
-
- hpa = pfn << PAGE_SHIFT;
- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
- "ent %llxn", vcpu->arch.mmu->root_level, pfn,
- hpa, *sptep);
-}
-
-static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
- struct kvm_rmap_head *rmap_head;
- struct kvm_mmu_page *rev_sp;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
- gfn_t gfn;
-
- rev_sp = sptep_to_sp(sptep);
- gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
- slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
- slot = __gfn_to_memslot(slots, gfn);
- if (!slot) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
- audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
- (long int)(sptep - rev_sp->spt), rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot);
- if (!rmap_head->val) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no rmap for writable spte %llx\n",
- *sptep);
- dump_stack();
- }
-}
-
-static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
- inspect_spte_has_rmap(vcpu->kvm, sptep);
-}
-
-static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp = sptep_to_sp(sptep);
-
- if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
- "root.\n", sp);
-}
-
-static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- int i;
-
- if (sp->role.level != PG_LEVEL_4K)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_shadow_present_pte(sp->spt[i]))
- continue;
-
- inspect_spte_has_rmap(kvm, sp->spt + i);
- }
-}
-
-static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- struct kvm_rmap_head *rmap_head;
- u64 *sptep;
- struct rmap_iterator iter;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
-
- if (sp->role.direct || sp->unsync || sp->role.invalid)
- return;
-
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, sp->gfn);
- rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
-
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- if (is_writable_pte(*sptep))
- audit_printk(kvm, "shadow page has writable "
- "mappings: gfn %llx role %x\n",
- sp->gfn, sp->role.word);
- }
-}
-
-static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- check_mappings_rmap(kvm, sp);
- audit_write_protection(kvm, sp);
-}
-
-static void audit_all_active_sps(struct kvm *kvm)
-{
- walk_all_active_sps(kvm, audit_sp);
-}
-
-static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- audit_sptes_have_rmaps(vcpu, sptep, level);
- audit_mappings(vcpu, sptep, level);
- audit_spte_after_sync(vcpu, sptep, level);
-}
-
-static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, audit_spte);
-}
-
-static bool mmu_audit;
-static DEFINE_STATIC_KEY_FALSE(mmu_audit_key);
-
-static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
-
- if (!__ratelimit(&ratelimit_state))
- return;
-
- vcpu->kvm->arch.audit_point = point;
- audit_all_active_sps(vcpu->kvm);
- audit_vcpu_spte(vcpu);
-}
-
-static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- if (static_branch_unlikely((&mmu_audit_key)))
- __kvm_mmu_audit(vcpu, point);
-}
-
-static void mmu_audit_enable(void)
-{
- if (mmu_audit)
- return;
-
- static_branch_inc(&mmu_audit_key);
- mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
- if (!mmu_audit)
- return;
-
- static_branch_dec(&mmu_audit_key);
- mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
- int ret;
- unsigned long enable;
-
- ret = kstrtoul(val, 10, &enable);
- if (ret < 0)
- return -EINVAL;
-
- switch (enable) {
- case 0:
- mmu_audit_disable();
- break;
- case 1:
- mmu_audit_enable();
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static const struct kernel_param_ops audit_param_ops = {
- .set = mmu_audit_set,
- .get = param_get_bool,
-};
-
-arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index de5e8e4e1aa7..12247b96af01 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -416,6 +416,29 @@ TRACE_EVENT(
)
);
+TRACE_EVENT(
+ kvm_mmu_split_huge_page,
+ TP_PROTO(u64 gfn, u64 spte, int level, int errno),
+ TP_ARGS(gfn, spte, level, errno),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(int, level)
+ __field(int, errno)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = spte;
+ __entry->level = level;
+ __entry->errno = errno;
+ ),
+
+ TP_printk("gfn %llx spte %llx level %d errno %d",
+ __entry->gfn, __entry->spte, __entry->level, __entry->errno)
+);
+
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5b5bdac97c7b..aa0e3c246aca 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -904,12 +904,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
r = FNAME(fetch)(vcpu, fault, &walker);
- kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 73cfe62fdad1..4739b53c9734 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -192,6 +192,65 @@ out:
return wrprot;
}
+static u64 make_spte_executable(u64 spte)
+{
+ bool is_access_track = is_access_track_spte(spte);
+
+ if (is_access_track)
+ spte = restore_acc_track_spte(spte);
+
+ spte &= ~shadow_nx_mask;
+ spte |= shadow_x_mask;
+
+ if (is_access_track)
+ spte = mark_spte_for_access_track(spte);
+
+ return spte;
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given huge page SPTE where
+ * `index` identifies which sub-page.
+ *
+ * This is used during huge page splitting to build the SPTEs that make up the
+ * new page table.
+ */
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
+{
+ u64 child_spte;
+ int child_level;
+
+ if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
+ return 0;
+
+ if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
+ return 0;
+
+ child_spte = huge_spte;
+ child_level = huge_level - 1;
+
+ /*
+ * The child_spte already has the base address of the huge page being
+ * split. So we just have to OR in the offset to the page at the next
+ * lower level for the given index.
+ */
+ child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
+
+ if (child_level == PG_LEVEL_4K) {
+ child_spte &= ~PT_PAGE_SIZE_MASK;
+
+ /*
+ * When splitting to a 4K page, mark the page executable as the
+ * NX hugepage mitigation no longer applies.
+ */
+ if (is_nx_huge_page_enabled())
+ child_spte = make_spte_executable(child_spte);
+ }
+
+ return child_spte;
+}
+
+
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
u64 spte = SPTE_MMU_PRESENT_MASK;
@@ -250,14 +309,7 @@ u64 mark_spte_for_access_track(u64 spte)
if (is_access_track_spte(spte))
return spte;
- /*
- * Making an Access Tracking PTE will result in removal of write access
- * from the PTE. So, verify that we will be able to restore the write
- * access in the fast page fault path later on.
- */
- WARN_ONCE((spte & PT_WRITABLE_MASK) &&
- !spte_can_locklessly_be_made_writable(spte),
- "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+ check_spte_writable_invariants(spte);
WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
@@ -368,8 +420,8 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_acc_track_mask = 0;
shadow_me_mask = sme_me_mask;
- shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
- shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE;
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE;
/*
* Set a reserved PA bit in MMIO SPTEs to generate page faults with
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index be6a007a4af3..73f12615416f 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -75,33 +75,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
/*
- * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
- * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
- * that map guest pages in read-only memslots and read-only VMAs.
- *
- * Invariants:
- * - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
- *
- *
- * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
- * allows writes to the guest page mapped by the SPTE. This bit is cleared when
- * the guest page mapped by the SPTE contains a page table that is being
- * monitored for shadow paging. In this case the SPTE can only be made writable
- * by unsyncing the shadow page under the mmu_lock.
- *
- * Invariants:
- * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
- * - If MMU-writable is set, Host-writable must be set.
- *
- * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
- * to track writes for dirty logging. For such SPTEs, KVM will locklessly set
- * PT_WRITABLE_MASK upon the next write from the guest and record the write in
- * the dirty log (see fast_page_fault()).
+ * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
+ * SPTE is write-protected. See is_writable_pte() for details.
*/
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
-#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
-#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
+#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10)
/*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care
@@ -339,15 +319,86 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
__is_rsvd_bits_set(rsvd_check, spte, level);
}
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+/*
+ * An shadow-present leaf SPTE may be non-writable for 3 possible reasons:
+ *
+ * 1. To intercept writes for dirty logging. KVM write-protects huge pages
+ * so that they can be split be split down into the dirty logging
+ * granularity (4KiB) whenever the guest writes to them. KVM also
+ * write-protects 4KiB pages so that writes can be recorded in the dirty log
+ * (e.g. if not using PML). SPTEs are write-protected for dirty logging
+ * during the VM-iotcls that enable dirty logging.
+ *
+ * 2. To intercept writes to guest page tables that KVM is shadowing. When a
+ * guest writes to its page table the corresponding shadow page table will
+ * be marked "unsync". That way KVM knows which shadow page tables need to
+ * be updated on the next TLB flush, INVLPG, etc. and which do not.
+ *
+ * 3. To prevent guest writes to read-only memory, such as for memory in a
+ * read-only memslot or guest memory backed by a read-only VMA. Writes to
+ * such pages are disallowed entirely.
+ *
+ * To keep track of why a given SPTE is write-protected, KVM uses 2
+ * software-only bits in the SPTE:
+ *
+ * shadow_mmu_writable_mask, aka MMU-writable -
+ * Cleared on SPTEs that KVM is currently write-protecting for shadow paging
+ * purposes (case 2 above).
+ *
+ * shadow_host_writable_mask, aka Host-writable -
+ * Cleared on SPTEs that are not host-writable (case 3 above)
+ *
+ * Note, not all possible combinations of PT_WRITABLE_MASK,
+ * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
+ * SPTE can be in only one of the following states, which map to the
+ * aforementioned 3 cases:
+ *
+ * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
+ * ------------------------- | ------------------------ | ----------------
+ * 1 | 1 | 1 (writable)
+ * 1 | 1 | 0 (case 1)
+ * 1 | 0 | 0 (case 2)
+ * 0 | 0 | 0 (case 3)
+ *
+ * The valid combinations of these bits are checked by
+ * check_spte_writable_invariants() whenever an SPTE is modified.
+ *
+ * Clearing the MMU-writable bit is always done under the MMU lock and always
+ * accompanied by a TLB flush before dropping the lock to avoid corrupting the
+ * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
+ * (which does not clear the MMU-writable bit), does not flush TLBs before
+ * dropping the lock, as it only needs to synchronize guest writes with the
+ * dirty bitmap.
+ *
+ * So, there is the problem: clearing the MMU-writable bit can encounter a
+ * write-protected SPTE while CPUs still have writable mappings for that SPTE
+ * cached in their TLB. To address this, KVM always flushes TLBs when
+ * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
+ *
+ * The Host-writable bit is not modified on present SPTEs, it is only set or
+ * cleared when an SPTE is first faulted in from non-present and then remains
+ * immutable.
+ */
+static inline bool is_writable_pte(unsigned long pte)
{
- if (spte & shadow_mmu_writable_mask) {
- WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
- return true;
- }
+ return pte & PT_WRITABLE_MASK;
+}
+
+/* Note: spte must be a shadow-present leaf SPTE. */
+static inline void check_spte_writable_invariants(u64 spte)
+{
+ if (spte & shadow_mmu_writable_mask)
+ WARN_ONCE(!(spte & shadow_host_writable_mask),
+ "kvm: MMU-writable SPTE is not Host-writable: %llx",
+ spte);
+ else
+ WARN_ONCE(is_writable_pte(spte),
+ "kvm: Writable SPTE is not MMU-writable: %llx", spte);
+}
- WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
- return false;
+static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+{
+ return spte & shadow_mmu_writable_mask;
}
static inline u64 get_mmio_spte_generation(u64 spte)
@@ -364,9 +415,25 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool can_unsync,
bool host_writable, u64 *new_spte);
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
+
+/* Restore an acc-track PTE back to a regular PTE */
+static inline u64 restore_acc_track_spte(u64 spte)
+{
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
+
+ spte &= ~shadow_acc_track_mask;
+ spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
+ spte |= saved_bits;
+
+ return spte;
+}
+
u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
void kvm_mmu_reset_all_pte_masks(void);
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index caa96c270b95..be3f096db2eb 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -40,17 +40,19 @@ void tdp_iter_restart(struct tdp_iter *iter)
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
-void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn)
{
+ int root_level = root->role.level;
+
WARN_ON(root_level < 1);
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
iter->next_last_level_gfn = next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
- iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
- iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
+ iter->as_id = kvm_mmu_page_as_id(root);
tdp_iter_restart(iter);
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index e19cabbcb65c..216ebbe76ddd 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -57,17 +57,17 @@ struct tdp_iter {
* Iterates over every SPTE mapping the GFN range [start, end) in a
* preorder traversal.
*/
-#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \
- for (tdp_iter_start(&iter, root, root_level, min_level, start); \
+#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start); \
iter.valid && iter.gfn < end; \
tdp_iter_next(&iter))
-#define for_each_tdp_pte(iter, root, root_level, start, end) \
- for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
+#define for_each_tdp_pte(iter, root, start, end) \
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
-void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index bc9e3553fba2..8def8f810cb0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -99,15 +99,18 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
}
/*
- * Finds the next valid root after root (or the first valid root if root
- * is NULL), takes a reference on it, and returns that next root. If root
- * is not NULL, this thread should have already taken a reference on it, and
- * that reference will be dropped. If no valid root is found, this
- * function will return NULL.
+ * Returns the next root after @prev_root (or the first root if @prev_root is
+ * NULL). A reference to the returned root is acquired, and the reference to
+ * @prev_root is released (the caller obviously must hold a reference to
+ * @prev_root if it's non-NULL).
+ *
+ * If @only_valid is true, invalid roots are skipped.
+ *
+ * Returns NULL if the end of tdp_mmu_roots was reached.
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
- bool shared)
+ bool shared, bool only_valid)
{
struct kvm_mmu_page *next_root;
@@ -121,9 +124,14 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
typeof(*next_root), link);
- while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+ while (next_root) {
+ if ((!only_valid || !next_root->role.invalid) &&
+ kvm_tdp_mmu_get_root(next_root))
+ break;
+
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
&next_root->link, typeof(*next_root), link);
+ }
rcu_read_unlock();
@@ -143,13 +151,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* mode. In the unlikely event that this thread must free a root, the lock
* will be temporarily dropped and reacquired in write mode.
*/
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \
- _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _shared)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
+ if (kvm_mmu_page_as_id(_root) != _as_id) { \
} else
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
+
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
+
#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
lockdep_is_held_type(&kvm->mmu_lock, 0) || \
@@ -157,57 +171,63 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
if (kvm_mmu_page_as_id(_root) != _as_id) { \
} else
-static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
- int level)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_page_role role;
+ struct kvm_mmu_page *sp;
- role = vcpu->arch.mmu->mmu_role.base;
- role.level = level;
- role.direct = true;
- role.has_4_byte_gpte = false;
- role.access = ACC_ALL;
- role.ad_disabled = !shadow_accessed_mask;
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
- return role;
+ return sp;
}
-static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- int level)
+static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, gfn_t gfn,
+ union kvm_mmu_page_role role)
{
- struct kvm_mmu_page *sp;
-
- sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
- sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- sp->role.word = page_role_for_level(vcpu, level).word;
+ sp->role = role;
sp->gfn = gfn;
sp->tdp_mmu_page = true;
trace_kvm_mmu_get_page(sp, true);
+}
- return sp;
+static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *parent_sp;
+ union kvm_mmu_page_role role;
+
+ parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
+
+ role = parent_sp->role;
+ role.level--;
+
+ tdp_mmu_init_sp(child_sp, iter->gfn, role);
}
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_page_role role;
+ union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
-
- /* Check for an existing root before allocating a new one. */
+ /*
+ * Check for an existing root before allocating a new one. Note, the
+ * role check prevents consuming an invalid root.
+ */
for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
if (root->role.word == role.word &&
- kvm_tdp_mmu_get_root(kvm, root))
+ kvm_tdp_mmu_get_root(root))
goto out;
}
- root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
+ root = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_sp(root, 0, role);
+
refcount_set(&root->tdp_mmu_root_count, 1);
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
@@ -252,25 +272,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
/**
- * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
- *
- * @kvm: kvm instance
- * @sp: the new page
- * @account_nx: This page replaces a NX large page and should be marked for
- * eventual reclaim.
- */
-static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool account_nx)
-{
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
- if (account_nx)
- account_huge_nx_page(kvm, sp);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-}
-
-/**
- * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
*
* @kvm: kvm instance
* @sp: the page to be removed
@@ -278,8 +280,8 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
*/
-static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool shared)
+static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared)
{
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
@@ -295,7 +297,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
}
/**
- * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
+ * handle_removed_pt() - handle a page table removed from the TDP structure
*
* @kvm: kvm instance
* @pt: the page removed from the paging structure
@@ -311,8 +313,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
* this thread will be responsible for ensuring the page is freed. Hence the
* early rcu_dereferences in the function.
*/
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
- bool shared)
+static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
@@ -321,7 +322,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
trace_kvm_mmu_prepare_zap_page(sp);
- tdp_mmu_unlink_page(kvm, sp, shared);
+ tdp_mmu_unlink_sp(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
u64 *sptep = rcu_dereference(pt) + i;
@@ -435,6 +436,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ if (is_leaf)
+ check_spte_writable_invariants(new_spte);
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -472,8 +476,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* the paging structure.
*/
if (was_present && !was_leaf && (pfn_changed || !is_present))
- handle_removed_tdp_mmu_page(kvm,
- spte_to_child_pt(old_spte, level), shared);
+ handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -492,16 +495,25 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* and handle the associated bookkeeping. Do not mark the page dirty
* in KVM's dirty bitmaps.
*
+ * If setting the SPTE fails because it has changed, iter->old_spte will be
+ * refreshed to the current value of the spte.
+ *
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @new_spte: The value the SPTE should be set to
- * Returns: true if the SPTE was set, false if it was not. If false is returned,
- * this function will have no side-effects.
+ * Return:
+ * * 0 - If the SPTE was set.
+ * * -EBUSY - If the SPTE cannot be set. In this case this function will have
+ * no side-effects other than setting iter->old_spte to the last
+ * known value of the spte.
*/
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
+ u64 *sptep = rcu_dereference(iter->sptep);
+ u64 old_spte;
+
WARN_ON_ONCE(iter->yielded);
lockdep_assert_held_read(&kvm->mmu_lock);
@@ -511,34 +523,45 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
* may modify it.
*/
if (is_removed_spte(iter->old_spte))
- return false;
+ return -EBUSY;
/*
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
* does not hold the mmu_lock.
*/
- if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
- new_spte) != iter->old_spte)
- return false;
+ old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
+ if (old_spte != iter->old_spte) {
+ /*
+ * The page table entry was modified by a different logical
+ * CPU. Refresh iter->old_spte with the current value so the
+ * caller operates on fresh data, e.g. if it retries
+ * tdp_mmu_set_spte_atomic().
+ */
+ iter->old_spte = old_spte;
+ return -EBUSY;
+ }
__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
new_spte, iter->level, true);
handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
- return true;
+ return 0;
}
-static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter)
+static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
+ int ret;
+
/*
* Freeze the SPTE by setting it to a special,
* non-present value. This will stop other threads from
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
- return false;
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
+ if (ret)
+ return ret;
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
KVM_PAGES_PER_HPAGE(iter->level));
@@ -553,7 +576,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
*/
WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
- return true;
+ return 0;
}
@@ -624,7 +647,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
}
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
- for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
+ for_each_tdp_pte(_iter, _root, _start, _end)
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
tdp_root_for_each_pte(_iter, _root, _start, _end) \
@@ -634,8 +657,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
else
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
- _mmu->shadow_root_level, _start, _end)
+ for_each_tdp_pte(_iter, to_shadow_page(_mmu->root_hpa), _start, _end)
/*
* Yield if the MMU lock is contended or this thread needs to return control
@@ -724,8 +746,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, start, end) {
+ for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
@@ -750,12 +771,7 @@ retry:
if (!shared) {
tdp_mmu_set_spte(kvm, &iter, 0);
flush = true;
- } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ } else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) {
goto retry;
}
}
@@ -913,7 +929,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
/*
@@ -947,6 +963,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
/*
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @account_nx: True if this page table is being installed to split a
+ * non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
+ *
+ * Returns: 0 if the new page table was installed. Non-0 if the page table
+ * could not be installed (e.g. the atomic compare-exchange failed).
+ */
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool account_nx,
+ bool shared)
+{
+ u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
+ int ret = 0;
+
+ if (shared) {
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+ if (ret)
+ return ret;
+ } else {
+ tdp_mmu_set_spte(kvm, iter, spte);
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+ if (account_nx)
+ account_huge_nx_page(kvm, sp);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+ return 0;
+}
+
+/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
*/
@@ -955,8 +1009,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
struct kvm_mmu *mmu = vcpu->arch.mmu;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
- u64 *child_pt;
- u64 new_spte;
int ret;
kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -979,7 +1031,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
*/
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
- if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
+ if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
break;
/*
@@ -991,6 +1043,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
if (!is_shadow_present_pte(iter.old_spte)) {
+ bool account_nx = fault->huge_page_disallowed &&
+ fault->req_level >= iter.level;
+
/*
* If SPTE has been frozen by another thread, just
* give up and retry, avoiding unnecessary page table
@@ -999,19 +1054,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (is_removed_spte(iter.old_spte))
break;
- sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
- child_pt = sp->spt;
-
- new_spte = make_nonleaf_spte(child_pt,
- !shadow_accessed_mask);
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
- if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
- tdp_mmu_link_page(vcpu->kvm, sp,
- fault->huge_page_disallowed &&
- fault->req_level >= iter.level);
-
- trace_kvm_mmu_get_page(sp, true);
- } else {
+ if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
tdp_mmu_free_sp(sp);
break;
}
@@ -1032,13 +1078,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
- struct kvm_mmu_page *root;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
- flush = zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
-
- return flush;
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
+ range->end, range->may_block, flush);
}
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
@@ -1180,8 +1221,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, start, end) {
+ for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1193,14 +1233,9 @@ retry:
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
goto retry;
- }
+
spte_set = true;
}
@@ -1221,13 +1256,197 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
return spte_set;
}
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+{
+ struct kvm_mmu_page *sp;
+
+ gfp |= __GFP_ZERO;
+
+ sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ if (!sp)
+ return NULL;
+
+ sp->spt = (void *)__get_free_page(gfp);
+ if (!sp->spt) {
+ kmem_cache_free(mmu_page_header_cache, sp);
+ return NULL;
+ }
+
+ return sp;
+}
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool shared)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since we are allocating while under the MMU lock we have to be
+ * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
+ * reclaim and to avoid making any filesystem callbacks (which can end
+ * up invoking KVM MMU notifiers, resulting in a deadlock).
+ *
+ * If this allocation fails we drop the lock and retry with reclaim
+ * allowed.
+ */
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+ if (sp)
+ return sp;
+
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ iter->yielded = true;
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ return sp;
+}
+
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ const u64 huge_spte = iter->old_spte;
+ const int level = iter->level;
+ int ret, i;
+
+ tdp_mmu_init_child_sp(sp, iter);
+
+ /*
+ * No need for atomics when writing to sp->spt since the page table has
+ * not been linked in yet and thus is not reachable from any other CPU.
+ */
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++)
+ sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
+
+ /*
+ * Replace the huge spte with a pointer to the populated lower level
+ * page table. Since we are making this change without a TLB flush vCPUs
+ * will see a mix of the split mappings and the original huge mapping,
+ * depending on what's currently in their TLB. This is fine from a
+ * correctness standpoint since the translation will be the same either
+ * way.
+ */
+ ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
+ if (ret)
+ goto out;
+
+ /*
+ * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
+ * are overwriting from the page stats. But we have to manually update
+ * the page stats with the new present child pages.
+ */
+ kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
+
+out:
+ trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
+ return ret;
+}
+
+static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ /*
+ * Traverse the page table splitting all huge pages above the target
+ * level into one lower level. For example, if we encounter a 1GB page
+ * we split it into 512 2MB pages.
+ *
+ * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+ * to visit an SPTE before ever visiting its children, which means we
+ * will correctly recursively split huge pages that are more than one
+ * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
+ * and then splitting each of those to 512 4KB pages).
+ */
+ for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
+ continue;
+
+ if (!sp) {
+ sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ if (!sp) {
+ ret = -ENOMEM;
+ trace_kvm_mmu_split_huge_page(iter.gfn,
+ iter.old_spte,
+ iter.level, ret);
+ break;
+ }
+
+ if (iter.yielded)
+ continue;
+ }
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
+ goto retry;
+
+ sp = NULL;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * It's possible to exit the loop having never used the last sp if, for
+ * example, a vCPU doing HugePage NX splitting wins the race and
+ * installs its own sp in place of the last sp we tried to split.
+ */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+
+ return ret;
+}
+
+
+/*
+ * Try to split all huge pages mapped by the TDP MMU down to the target level.
+ */
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *root;
+ int r = 0;
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+ r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
+ if (r) {
+ kvm_tdp_mmu_put_root(kvm, root, shared);
+ break;
+ }
+ }
+}
+
/*
* Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
* AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
@@ -1261,14 +1480,9 @@ retry:
continue;
}
- if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
goto retry;
- }
+
spte_set = true;
}
@@ -1291,7 +1505,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
@@ -1392,14 +1606,8 @@ retry:
continue;
/* Note, a successful atomic zap also does a remote TLB flush. */
- if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_zap_spte_atomic(kvm, &iter))
goto retry;
- }
}
rcu_read_unlock();
@@ -1416,7 +1624,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
zap_collapsible_spte_range(kvm, root, slot);
}
@@ -1436,8 +1644,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 3899004a5d91..3f987785702a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,12 +7,8 @@
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
- struct kvm_mmu_page *root)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
- if (root->role.invalid)
- return false;
-
return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}
@@ -71,6 +67,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
int min_level);
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared);
+
static inline void kvm_tdp_mmu_walk_lockless_begin(void)
{
rcu_read_lock();
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index f614f95acc6b..b1a02993782b 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -95,7 +95,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
}
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
- unsigned config, bool exclude_user,
+ u64 config, bool exclude_user,
bool exclude_kernel, bool intr,
bool in_tx, bool in_tx_cp)
{
@@ -181,7 +181,8 @@ static int cmp_u64(const void *a, const void *b)
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
- unsigned config, type = PERF_TYPE_RAW;
+ u64 config;
+ u32 type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
bool allow_event = true;
@@ -220,7 +221,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
}
if (type == PERF_TYPE_RAW)
- config = eventsel & X86_RAW_EVENT_MASK;
+ config = eventsel & AMD64_RAW_EVENT_MASK;
if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
return;
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 90364d02f22a..d4fa8c4f3a9a 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -27,20 +27,6 @@
#include "irq.h"
#include "svm.h"
-#define SVM_AVIC_DOORBELL 0xc001011b
-
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
-/*
- * 0xff is broadcast, so the max index allowed for physical APIC ID
- * table is 0xfe. APIC IDs above 0xff are reserved.
- */
-#define AVIC_MAX_PHYSICAL_ID_COUNT 255
-
-#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
-#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
-#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
-
/* AVIC GATAG is encoded using VM and VCPU IDs */
#define AVIC_VCPU_ID_BITS 8
#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
@@ -73,12 +59,6 @@ struct amd_svm_iommu_ir {
void *data; /* Storing pointer to struct amd_ir_data */
};
-enum avic_ipi_failure_cause {
- AVIC_IPI_FAILURE_INVALID_INT_TYPE,
- AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
- AVIC_IPI_FAILURE_INVALID_TARGET,
- AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
-};
/* Note:
* This function is called from IOMMU driver to notify
@@ -289,6 +269,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+void avic_ring_doorbell(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any point,
+ * which could result in signalling the wrong/previous pCPU. But if
+ * that happens the vCPU is guaranteed to do a VMRUN (after being
+ * migrated) and thus will process pending interrupts, i.e. a doorbell
+ * is not needed (and the spurious one is harmless).
+ */
+ int cpu = READ_ONCE(vcpu->cpu);
+
+ if (cpu != get_cpu())
+ wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ put_cpu();
+}
+
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh)
{
@@ -304,8 +300,13 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
kvm_for_each_vcpu(i, vcpu, kvm) {
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK))
- kvm_vcpu_wake_up(vcpu);
+ icrl & APIC_DEST_MASK)) {
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ }
}
}
@@ -345,8 +346,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_INVALID_TARGET:
- WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, vcpu->vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -579,7 +578,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
return ret;
}
-void avic_post_state_restore(struct kvm_vcpu *vcpu)
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
if (avic_handle_apic_id_update(vcpu) != 0)
return;
@@ -587,20 +586,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
-void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
-}
-
-void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
-{
-}
-
-static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
unsigned long flags;
@@ -632,7 +618,7 @@ out:
return ret;
}
-void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -649,7 +635,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
* we need to check and update the AVIC logical APIC ID table
* accordingly before re-activating.
*/
- avic_post_state_restore(vcpu);
+ avic_apicv_post_state_restore(vcpu);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
} else {
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
@@ -661,63 +647,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
else
avic_vcpu_put(vcpu);
- svm_set_pi_irte_mode(vcpu, activated);
-}
-
-void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
-{
- return;
-}
-
-int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
-{
- if (!vcpu->arch.apicv_active)
- return -1;
-
- kvm_lapic_set_irr(vec, vcpu->arch.apic);
-
- /*
- * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
- * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
- * the read of guest_mode, which guarantees that either VMRUN will see
- * and process the new vIRR entry, or that the below code will signal
- * the doorbell if the vCPU is already running in the guest.
- */
- smp_mb__after_atomic();
-
- /*
- * Signal the doorbell to tell hardware to inject the IRQ if the vCPU
- * is in the guest. If the vCPU is not in the guest, hardware will
- * automatically process AVIC interrupts at VMRUN.
- */
- if (vcpu->mode == IN_GUEST_MODE) {
- int cpu = READ_ONCE(vcpu->cpu);
-
- /*
- * Note, the vCPU could get migrated to a different pCPU at any
- * point, which could result in signalling the wrong/previous
- * pCPU. But if that happens the vCPU is guaranteed to do a
- * VMRUN (after being migrated) and thus will process pending
- * interrupts, i.e. a doorbell is not needed (and the spurious
- * one is harmless).
- */
- if (cpu != get_cpu())
- wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
- put_cpu();
- } else {
- /*
- * Wake the vCPU if it was blocking. KVM will then detect the
- * pending IRQ when checking if the vCPU has a wake event.
- */
- kvm_vcpu_wake_up(vcpu);
- }
-
- return 0;
-}
-
-bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- return false;
+ avic_set_pi_irte_mode(vcpu, activated);
}
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
@@ -817,7 +747,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
}
/*
- * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ * avic_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
@@ -825,8 +755,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
-int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
@@ -926,7 +856,7 @@ out:
return ret;
}
-bool svm_check_apicv_inhibit_reasons(ulong bit)
+bool avic_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
BIT(APICV_INHIBIT_REASON_ABSENT) |
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
new file mode 100644
index 000000000000..7d6d97968fb9
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM).
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__
+#define __ARCH_X86_KVM_SVM_HYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#include "../hyperv.h"
+
+/*
+ * Hyper-V uses the software reserved 32 bytes in VMCB
+ * control area to expose SVM enlightenments to guests.
+ */
+struct hv_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB
+ */
+#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
+
+#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 1218b5a342fc..f284e61451c8 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -28,6 +28,7 @@
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"
+#include "hyperv.h"
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
@@ -165,14 +166,30 @@ void recalc_intercepts(struct vcpu_svm *svm)
vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
+/*
+ * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
+ * is optimized in that it only merges the parts where KVM MSR permission bitmap
+ * may contain zero bits.
+ */
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)svm->nested.ctl.reserved_sw;
+ int i;
+
/*
- * This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is optimized in that it only merges the parts where
- * the kvm msr permission bitmap may contain zero bits
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) is using Hyper-V emulation interface and
+ * tells KVM (L0) there were no changes in MSR bitmap for L2.
*/
- int i;
+ if (!svm->nested.force_msr_bitmap_recalc &&
+ kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
@@ -193,6 +210,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
svm->nested.msrpm[p] = svm->msrpm[p] | value;
}
+ svm->nested.force_msr_bitmap_recalc = false;
+
+set_msrpm_base_pa:
svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
return true;
@@ -298,7 +318,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
}
static
-void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to,
+void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *to,
struct vmcb_control_area *from)
{
unsigned int i;
@@ -331,12 +352,19 @@ void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to,
to->asid = from->asid;
to->msrpm_base_pa &= ~0x0fffULL;
to->iopm_base_pa &= ~0x0fffULL;
+
+ /* Hyper-V extensions (Enlightened VMCB) */
+ if (kvm_hv_hypercall_enabled(vcpu)) {
+ to->clean = from->clean;
+ memcpy(to->reserved_sw, from->reserved_sw,
+ sizeof(struct hv_enlightenments));
+ }
}
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
struct vmcb_control_area *control)
{
- __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control);
+ __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
}
static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
@@ -494,6 +522,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
new_vmcb12 = true;
svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ svm->nested.force_msr_bitmap_recalc = true;
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
@@ -1302,6 +1331,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
dst->virt_ext = from->virt_ext;
dst->pause_filter_count = from->pause_filter_count;
dst->pause_filter_thresh = from->pause_filter_thresh;
+ /* 'clean' and 'reserved_sw' are not changed by KVM */
}
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
@@ -1434,7 +1464,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
ret = -EINVAL;
- __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl);
+ __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
goto out_free;
@@ -1457,18 +1487,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
!__nested_vmcb_check_save(vcpu, &save_cached))
goto out_free;
- /*
- * While the nested guest CR3 is already checked and set by
- * KVM_SET_SREGS, it was set when nested state was yet loaded,
- * thus MMU might not be initialized correctly.
- * Set it again to fix this.
- */
-
- ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
- nested_npt_enabled(svm), false);
- if (WARN_ON_ONCE(ret))
- goto out_free;
-
/*
* All checks done, we can enter guest mode. Userspace provides
@@ -1494,6 +1512,21 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm_switch_vmcb(svm, &svm->nested.vmcb02);
nested_vmcb02_prepare_control(svm);
+
+ /*
+ * While the nested guest CR3 is already checked and set by
+ * KVM_SET_SREGS, it was set when nested state was yet loaded,
+ * thus MMU might not be initialized correctly.
+ * Set it again to fix this.
+ */
+
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+ svm->nested.force_msr_bitmap_recalc = true;
+
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
out_free:
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 17b53457d866..789b69294d28 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free;
INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);
return 0;
@@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
}
}
-static void sev_migrate_from(struct kvm_sev_info *dst,
- struct kvm_sev_info *src)
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
+ struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_sev_info *mirror;
+
dst->active = true;
dst->asid = src->asid;
dst->handle = src->handle;
@@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
src->enc_context_owner = NULL;
list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info =
+ &to_kvm_svm(dst->enc_context_owner)->sev_info;
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
}
static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1681,7 +1709,7 @@ static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
return 0;
}
-int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
@@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
src_sev = &to_kvm_svm(source_kvm)->sev_info;
- /*
- * VMs mirroring src's encryption context rely on it to keep the
- * ASID allocated, but below we are clearing src_sev->asid.
- */
- if (src_sev->num_mirrored_vms) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_source_vcpu;
}
- sev_migrate_from(dst_sev, src_sev);
+
+ sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;
@@ -1761,7 +1781,7 @@ out_fput:
return ret;
}
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
int r;
@@ -1858,8 +1878,8 @@ out:
return r;
}
-int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct enc_region *region;
@@ -1932,8 +1952,8 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
kfree(region);
}
-int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
{
struct enc_region *region;
int ret;
@@ -1972,7 +1992,7 @@ failed:
return ret;
}
-int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
@@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
*/
source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
- source_sev->num_mirrored_vms++;
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
- mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
mirror_sev->asid = source_sev->asid;
@@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->es_active = source_sev->es_active;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
ret = 0;
/*
@@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
- WARN_ON(sev->num_mirrored_vms);
-
if (!sev_guest(kvm))
return;
+ WARN_ON(!list_empty(&sev->mirror_vms));
+
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
- struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
mutex_lock(&owner_kvm->lock);
- if (!WARN_ON(!owner_sev->num_mirrored_vms))
- owner_sev->num_mirrored_vms--;
+ list_del(&sev->mirror_entry);
mutex_unlock(&owner_kvm->lock);
kvm_put_kvm(owner_kvm);
return;
@@ -2173,7 +2192,7 @@ out:
#endif
}
-void sev_hardware_teardown(void)
+void sev_hardware_unsetup(void)
{
if (!sev_enabled)
return;
@@ -2907,20 +2926,16 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
sev_enc_bit));
}
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
+void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- struct vmcb_save_area *hostsa;
-
/*
* As an SEV-ES guest, hardware will restore the host state on VMEXIT,
- * of which one step is to perform a VMLOAD. Since hardware does not
- * perform a VMSAVE on VMRUN, the host savearea must be updated.
+ * of which one step is to perform a VMLOAD. KVM performs the
+ * corresponding VMSAVE in svm_prepare_guest_switch for both
+ * traditional and SEV-ES guests.
*/
- vmsave(__sme_page_pa(sd->save_area));
/* XCR0 is restored on VMEXIT, save the current host value */
- hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
/* PKRU is restored on VMEXIT, save the current host value */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a290efb272ad..7038c76fa841 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -263,7 +263,7 @@ u32 svm_msrpm_offset(u32 msr)
return MSR_INVALID;
}
-#define MAX_INST_SIZE 15
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
static int get_npt_level(void)
{
@@ -353,7 +353,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
-static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -401,7 +401,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- (void)skip_emulated_instruction(vcpu);
+ (void)svm_skip_emulated_instruction(vcpu);
rip = kvm_rip_read(vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
@@ -668,6 +668,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
u32 msr, int read, int write)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u8 bit_read, bit_write;
unsigned long tmp;
u32 offset;
@@ -698,7 +699,7 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
msrpm[offset] = tmp;
svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
-
+ svm->nested.force_msr_bitmap_recalc = true;
}
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
@@ -873,11 +874,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-static void svm_hardware_teardown(void)
+static void svm_hardware_unsetup(void)
{
int cpu;
- sev_hardware_teardown();
+ sev_hardware_unsetup();
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
@@ -1175,7 +1176,7 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
}
-static int svm_create_vcpu(struct kvm_vcpu *vcpu)
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *vmcb01_page;
@@ -1246,7 +1247,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1265,7 +1266,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
@@ -1280,10 +1281,12 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
+ vmsave(__sme_page_pa(sd->save_area));
if (sev_es_guest(vcpu->kvm)) {
- sev_es_prepare_guest_switch(svm, vcpu->cpu);
- } else {
- vmsave(__sme_page_pa(sd->save_area));
+ struct vmcb_save_area *hostsa;
+ hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+
+ sev_es_prepare_switch_to_guest(hostsa);
}
if (tsc_scaling) {
@@ -1529,6 +1532,15 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
return save->cpl;
}
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+
static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1563,7 +1575,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
-static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1585,6 +1597,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 hcr0 = cr0;
+ bool old_paging = is_paging(vcpu);
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1601,8 +1614,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
#endif
vcpu->arch.cr0 = cr0;
- if (!npt_enabled)
+ if (!npt_enabled) {
hcr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (old_paging != is_paging(vcpu))
+ svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
/*
* re-enable caching here because the QEMU bios
@@ -1643,11 +1659,15 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = vcpu->arch.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb(vcpu);
+ svm_flush_tlb_current(vcpu);
vcpu->arch.cr4 = cr4;
- if (!npt_enabled)
+ if (!npt_enabled) {
cr4 |= X86_CR4_PAE;
+
+ if (!is_paging(vcpu))
+ cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -2261,7 +2281,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
- if (!skip_emulated_instruction(vcpu))
+ if (!svm_skip_emulated_instruction(vcpu))
return 0;
}
@@ -3126,7 +3146,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
-static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(u64 exit_code)
{
return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
svm_exit_handlers[exit_code]);
@@ -3146,7 +3166,7 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (!svm_check_exit_valid(vcpu, exit_code))
+ if (!svm_check_exit_valid(exit_code))
return svm_handle_invalid_exit(vcpu, exit_code);
#ifdef CONFIG_RETPOLINE
@@ -3181,7 +3201,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
*error_code = 0;
}
-static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
@@ -3278,7 +3298,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
++vcpu->stat.nmi_injections;
}
-static void svm_set_irq(struct kvm_vcpu *vcpu)
+static void svm_inject_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3291,21 +3311,55 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
-static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector)
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vector)
{
- struct kvm_vcpu *vcpu = apic->vcpu;
+ /*
+ * vcpu->arch.apicv_active must be read after vcpu->mode.
+ * Pairs with smp_store_release in vcpu_enter_guest.
+ */
+ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
- if (svm_deliver_avic_intr(vcpu, vector)) {
- kvm_lapic_set_irr(vector, apic);
+ if (!READ_ONCE(vcpu->arch.apicv_active)) {
+ /* Process the interrupt via inject_pending_event */
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
+ return;
+ }
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+ if (in_guest_mode) {
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ. If
+ * the vCPU exits the guest before the doorbell chimes, hardware
+ * will automatically process AVIC interrupts at the next VMRUN.
+ */
+ avic_ring_doorbell(vcpu);
} else {
- trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector);
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
+ kvm_vcpu_wake_up(vcpu);
}
}
+static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ kvm_lapic_set_irr(vector, apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode. This guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that svm_complete_interrupt_delivery
+ * will signal the doorbell if the CPU has already entered the guest.
+ */
+ smp_mb__after_atomic();
+ svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
+}
+
static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3353,11 +3407,13 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_nmi_blocked(vcpu))
+ return 0;
+
/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return -EBUSY;
-
- return !svm_nmi_blocked(vcpu);
+ return 1;
}
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -3409,9 +3465,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_svm *svm = to_svm(vcpu);
+
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_interrupt_blocked(vcpu))
+ return 0;
+
/*
* An IRQ must not be injected into L2 if it's supposed to VM-Exit,
* e.g. if the IRQ arrived asynchronously after checking nested events.
@@ -3419,7 +3479,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
return -EBUSY;
- return !svm_interrupt_blocked(vcpu);
+ return 1;
}
static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -3468,17 +3528,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- return 0;
-}
-
-static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
-{
- return 0;
-}
-
-void svm_flush_tlb(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3852,11 +3902,6 @@ static int __init svm_check_processor_compat(void)
return 0;
}
-static bool svm_cpu_has_accelerated_tpr(void)
-{
- return false;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -3879,11 +3924,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
return true;
}
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
- return 0;
-}
-
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4150,11 +4190,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_smi_blocked(vcpu))
+ return 0;
+
/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
return -EBUSY;
- return !svm_smi_blocked(vcpu);
+ return 1;
}
static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
@@ -4188,7 +4231,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
* by 0x400 (matches the offset of 'struct vmcb_save_area'
* within 'struct vmcb'). Note: HSAVE area may also be used by
* L1 hypervisor to save additional host context (e.g. KVM does
- * that, see svm_prepare_guest_switch()) which must be
+ * that, see svm_prepare_switch_to_guest()) which must be
* preserved.
*/
if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
@@ -4248,11 +4291,18 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
* Enter the nested guest now
*/
+ vmcb_mark_all_dirty(svm->vmcb01.ptr);
+
vmcb12 = map.hva;
nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+ if (ret)
+ goto unmap_save;
+
+ svm->nested.nested_run_pending = 1;
+
unmap_save:
kvm_vcpu_unmap(vcpu, &map_save, true);
unmap_map:
@@ -4456,21 +4506,20 @@ static int svm_vm_init(struct kvm *kvm)
static struct kvm_x86_ops svm_x86_ops __initdata = {
.name = "kvm_amd",
- .hardware_unsetup = svm_hardware_teardown,
+ .hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
- .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
.has_emulated_msr = svm_has_emulated_msr,
- .vcpu_create = svm_create_vcpu,
- .vcpu_free = svm_free_vcpu,
+ .vcpu_create = svm_vcpu_create,
+ .vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
.vm_size = sizeof(struct kvm_svm),
.vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
- .prepare_guest_switch = svm_prepare_guest_switch,
+ .prepare_switch_to_guest = svm_prepare_switch_to_guest,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
.vcpu_blocking = avic_vcpu_blocking,
@@ -4484,9 +4533,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.get_segment = svm_get_segment,
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
- .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .get_cs_db_l_bits = svm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
- .post_set_cr3 = svm_post_set_cr3,
+ .post_set_cr3 = sev_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
@@ -4501,21 +4550,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
- .tlb_flush_all = svm_flush_tlb,
- .tlb_flush_current = svm_flush_tlb,
- .tlb_flush_gva = svm_flush_tlb_gva,
- .tlb_flush_guest = svm_flush_tlb,
+ .flush_tlb_all = svm_flush_tlb_current,
+ .flush_tlb_current = svm_flush_tlb_current,
+ .flush_tlb_gva = svm_flush_tlb_gva,
+ .flush_tlb_guest = svm_flush_tlb_current,
.vcpu_pre_run = svm_vcpu_pre_run,
- .run = svm_vcpu_run,
- .handle_exit = handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
+ .vcpu_run = svm_vcpu_run,
+ .handle_exit = svm_handle_exit,
+ .skip_emulated_instruction = svm_skip_emulated_instruction,
.update_emulated_instruction = NULL,
.set_interrupt_shadow = svm_set_interrupt_shadow,
.get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
- .set_irq = svm_set_irq,
- .set_nmi = svm_inject_nmi,
+ .inject_irq = svm_inject_irq,
+ .inject_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
@@ -4525,17 +4574,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
- .set_virtual_apic_mode = svm_set_virtual_apic_mode,
- .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
- .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
- .load_eoi_exitmap = svm_load_eoi_exitmap,
- .hwapic_irr_update = svm_hwapic_irr_update,
- .hwapic_isr_update = svm_hwapic_isr_update,
- .apicv_post_state_restore = avic_post_state_restore,
-
- .set_tss_addr = svm_set_tss_addr,
- .set_identity_map_addr = svm_set_identity_map_addr,
- .get_mt_mask = svm_get_mt_mask,
+ .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+ .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
+ .apicv_post_state_restore = avic_apicv_post_state_restore,
.get_exit_info = svm_get_exit_info,
@@ -4561,8 +4602,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.nested_ops = &svm_nested_ops,
.deliver_interrupt = svm_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
- .update_pi_irte = svm_update_pi_irte,
+ .pi_update_irte = avic_pi_update_irte,
.setup_mce = svm_setup_mce,
.smi_allowed = svm_smi_allowed,
@@ -4570,12 +4610,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.leave_smm = svm_leave_smm,
.enable_smi_window = svm_enable_smi_window,
- .mem_enc_op = svm_mem_enc_op,
- .mem_enc_reg_region = svm_register_enc_region,
- .mem_enc_unreg_region = svm_unregister_enc_region,
+ .mem_enc_ioctl = sev_mem_enc_ioctl,
+ .mem_enc_register_region = sev_mem_enc_register_region,
+ .mem_enc_unregister_region = sev_mem_enc_unregister_region,
- .vm_copy_enc_context_from = svm_vm_copy_asid_from,
- .vm_move_enc_context_from = svm_vm_migrate_from,
+ .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
+ .vm_move_enc_context_from = sev_vm_move_enc_context_from,
.can_emulate_instruction = svm_can_emulate_instruction,
@@ -4637,6 +4677,7 @@ static __init void svm_set_cpu_caps(void)
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
+ kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
@@ -4819,7 +4860,7 @@ static __init int svm_hardware_setup(void)
return 0;
err:
- svm_hardware_teardown();
+ svm_hardware_unsetup();
return r;
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 73525353e424..70850cbe5bcb 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,7 +79,8 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
- unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
+ struct list_head mirror_vms; /* List of VMs mirroring */
+ struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};
@@ -137,6 +138,8 @@ struct vmcb_ctrl_area_cached {
u32 event_inj_err;
u64 nested_cr3;
u64 virt_ext;
+ u32 clean;
+ u8 reserved_sw[32];
};
struct svm_nested_state {
@@ -163,6 +166,15 @@ struct svm_nested_state {
struct vmcb_save_area_cached save;
bool initialized;
+
+ /*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
};
struct vcpu_sev_es_state {
@@ -321,7 +333,7 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
/*
* Only the PDPTRs are loaded on demand into the shadow MMU. All other
- * fields are synchronized in handle_exit, because accessing the VMCB is cheap.
+ * fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
*
* CR3 might be out of date in the VMCB but it is not marked dirty; instead,
* KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
@@ -480,7 +492,6 @@ void svm_vcpu_free_msrpm(u32 *msrpm);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void svm_flush_tlb(struct kvm_vcpu *vcpu);
void disable_nmi_singlestep(struct vcpu_svm *svm);
bool svm_smi_blocked(struct kvm_vcpu *vcpu);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
@@ -489,6 +500,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value);
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write);
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vec);
/* nested.c */
@@ -556,17 +569,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
-#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
-#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
-#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
-#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
-#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
-
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -576,19 +578,18 @@ int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
-void avic_post_state_restore(struct kvm_vcpu *vcpu);
-void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-bool svm_check_apicv_inhibit_reasons(ulong bit);
-void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
-void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
-void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
-int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
-bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
+void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+bool avic_check_apicv_inhibit_reasons(ulong bit);
+void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_ring_doorbell(struct kvm_vcpu *vcpu);
/* sev.c */
@@ -599,17 +600,17 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
extern unsigned int max_sev_asid;
void sev_vm_destroy(struct kvm *kvm);
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp);
-int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range);
-int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range);
-int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
-int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd);
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
-void sev_hardware_teardown(void);
+void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
void sev_free_vcpu(struct kvm_vcpu *vcpu);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
@@ -617,7 +618,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
+void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 489ca56212c6..e2fc59380465 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -7,35 +7,12 @@
#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
#if IS_ENABLED(CONFIG_HYPERV)
-#include <asm/mshyperv.h>
-#include "hyperv.h"
#include "kvm_onhyperv.h"
+#include "svm/hyperv.h"
static struct kvm_x86_ops svm_x86_ops;
-/*
- * Hyper-V uses the software reserved 32 bytes in VMCB
- * control area to expose SVM enlightenments to guests.
- */
-struct hv_enlightenments {
- struct __packed hv_enlightenments_control {
- u32 nested_flush_hypercall:1;
- u32 msr_bitmap:1;
- u32 enlightened_npt_tlb: 1;
- u32 reserved:29;
- } __packed hv_enlightenments_control;
- u32 hv_vp_id;
- u64 hv_vm_id;
- u64 partition_assist_page;
- u64 reserved;
-} __packed;
-
-/*
- * Hyper-V uses the software reserved clean bit in VMCB
- */
-#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
-
int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 92e6f6702f00..e5a8c271b42d 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -64,9 +64,9 @@ TRACE_EVENT(kvm_hypercall,
* Tracepoint for hypercall.
*/
TRACE_EVENT(kvm_hv_hypercall,
- TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
- __u64 ingpa, __u64 outgpa),
- TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+ TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt,
+ __u16 rep_idx, __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa),
TP_STRUCT__entry(
__field( __u16, rep_cnt )
@@ -74,6 +74,7 @@ TRACE_EVENT(kvm_hv_hypercall,
__field( __u64, ingpa )
__field( __u64, outgpa )
__field( __u16, code )
+ __field( __u16, var_cnt )
__field( bool, fast )
),
@@ -83,13 +84,14 @@ TRACE_EVENT(kvm_hv_hypercall,
__entry->ingpa = ingpa;
__entry->outgpa = outgpa;
__entry->code = code;
+ __entry->var_cnt = var_cnt;
__entry->fast = fast;
),
- TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
__entry->code, __entry->fast ? "fast" : "slow",
- __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
- __entry->outgpa)
+ __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx,
+ __entry->ingpa, __entry->outgpa)
);
TRACE_EVENT(kvm_hv_hypercall_done,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ba34e94049c7..c73e4d938ddc 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4797,7 +4797,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
return 0;
}
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+ bool vcpu_has_perf_global_ctrl)
{
struct vcpu_vmx *vmx;
@@ -4805,7 +4806,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
return;
vmx = to_vmx(vcpu);
- if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+ if (vcpu_has_perf_global_ctrl) {
vmx->nested.msrs.entry_ctls_high |=
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
vmx->nested.msrs.exit_ctls_high |=
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index b69a80f43b37..c92cea0b8ccc 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+ bool vcpu_has_perf_global_ctrl);
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
int size);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 466d18fc0c5d..03fab48b149c 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -541,7 +541,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
bitmap_set(pmu->all_valid_pmc_idx,
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
- nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+ nested_vmx_pmu_refresh(vcpu,
+ intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL));
if (intel_pmu_lbr_is_compatible(vcpu))
x86_perf_get_lbr(&lbr_desc->records);
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index aa1fe9085d77..3834bb30ce54 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -244,7 +244,7 @@ void vmx_pi_start_assignment(struct kvm *kvm)
}
/*
- * pi_update_irte - set IRTE for Posted-Interrupts
+ * vmx_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
@@ -252,8 +252,8 @@ void vmx_pi_start_assignment(struct kvm *kvm)
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
- bool set)
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index eb14e76b84ef..9a45d5c9f116 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -97,8 +97,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
- bool set);
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6c27bd0c89e1..d8547144d3b7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -541,11 +541,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static inline bool report_flexpriority(void)
-{
- return flexpriority_enabled;
-}
-
static int possible_passthrough_msr_slot(u32 msr)
{
u32 i;
@@ -2341,7 +2336,7 @@ fault:
return -EFAULT;
}
-static int hardware_enable(void)
+static int vmx_hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2382,7 +2377,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void hardware_disable(void)
+static void vmx_hardware_disable(void)
{
vmclear_local_loaded_vmcss();
@@ -3937,31 +3932,33 @@ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
#ifdef CONFIG_SMP
if (vcpu->mode == IN_GUEST_MODE) {
/*
- * The vector of interrupt to be delivered to vcpu had
- * been set in PIR before this function.
+ * The vector of the virtual has already been set in the PIR.
+ * Send a notification event to deliver the virtual interrupt
+ * unless the vCPU is the currently running vCPU, i.e. the
+ * event is being sent from a fastpath VM-Exit handler, in
+ * which case the PIR will be synced to the vIRR before
+ * re-entering the guest.
*
- * Following cases will be reached in this block, and
- * we always send a notification event in all cases as
- * explained below.
+ * When the target is not the running vCPU, the following
+ * possibilities emerge:
*
- * Case 1: vcpu keeps in non-root mode. Sending a
- * notification event posts the interrupt to vcpu.
+ * Case 1: vCPU stays in non-root mode. Sending a notification
+ * event posts the interrupt to the vCPU.
*
- * Case 2: vcpu exits to root mode and is still
- * runnable. PIR will be synced to vIRR before the
- * next vcpu entry. Sending a notification event in
- * this case has no effect, as vcpu is not in root
- * mode.
+ * Case 2: vCPU exits to root mode and is still runnable. The
+ * PIR will be synced to the vIRR before re-entering the guest.
+ * Sending a notification event is ok as the host IRQ handler
+ * will ignore the spurious event.
*
- * Case 3: vcpu exits to root mode and is blocked.
- * vcpu_block() has already synced PIR to vIRR and
- * never blocks vcpu if vIRR is not cleared. Therefore,
- * a blocked vcpu here does not wait for any requested
- * interrupts in PIR, and sending a notification event
- * which has no effect is safe here.
+ * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+ * has already synced PIR to vIRR and never blocks the vCPU if
+ * the vIRR is not empty. Therefore, a blocked vCPU here does
+ * not wait for any requested interrupts in PIR, and sending a
+ * notification event also results in a benign, spurious event.
*/
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ if (vcpu != kvm_get_running_vcpu())
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return;
}
#endif
@@ -5184,7 +5181,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!kvm_require_dr(vcpu, dr))
return 1;
- if (kvm_x86_ops.get_cpl(vcpu) > 0)
+ if (vmx_get_cpl(vcpu) > 0)
goto out;
dr7 = vmcs_readl(GUEST_DR7);
@@ -6969,7 +6966,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return vmx_exit_handlers_fastpath(vcpu);
}
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6980,7 +6977,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
free_loaded_vmcs(vmx->loaded_vmcs);
}
-static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
+static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
@@ -7342,11 +7339,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_secondary_exec_control(vmx));
if (nested_vmx_allowed(vcpu))
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ vmx->msr_ia32_feature_control_valid_bits &=
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
@@ -7659,6 +7656,7 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
if (ret)
return ret;
+ vmx->nested.nested_run_pending = 1;
vmx->nested.smm.guest_mode = false;
}
return 0;
@@ -7684,7 +7682,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void hardware_unsetup(void)
+static void vmx_hardware_unsetup(void)
{
kvm_set_posted_intr_wakeup_handler(NULL);
@@ -7707,21 +7705,20 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.name = "kvm_intel",
- .hardware_unsetup = hardware_unsetup,
+ .hardware_unsetup = vmx_hardware_unsetup,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
+ .hardware_enable = vmx_hardware_enable,
+ .hardware_disable = vmx_hardware_disable,
.has_emulated_msr = vmx_has_emulated_msr,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
.vcpu_reset = vmx_vcpu_reset,
- .prepare_guest_switch = vmx_prepare_switch_to_guest,
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
@@ -7749,21 +7746,21 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.set_rflags = vmx_set_rflags,
.get_if_flag = vmx_get_if_flag,
- .tlb_flush_all = vmx_flush_tlb_all,
- .tlb_flush_current = vmx_flush_tlb_current,
- .tlb_flush_gva = vmx_flush_tlb_gva,
- .tlb_flush_guest = vmx_flush_tlb_guest,
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
.vcpu_pre_run = vmx_vcpu_pre_run,
- .run = vmx_vcpu_run,
+ .vcpu_run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
.update_emulated_instruction = vmx_update_emulated_instruction,
.set_interrupt_shadow = vmx_set_interrupt_shadow,
.get_interrupt_shadow = vmx_get_interrupt_shadow,
.patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
.cancel_injection = vmx_cancel_injection,
.interrupt_allowed = vmx_interrupt_allowed,
@@ -7816,8 +7813,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
- .update_pi_irte = pi_update_irte,
- .start_assignment = vmx_pi_start_assignment,
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7131d735b1ef..16d29d41908f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -126,16 +126,15 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_x86_ops);
#define KVM_X86_OP(func) \
DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
*(((struct kvm_x86_ops *)0)->func));
-#define KVM_X86_OP_NULL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
-EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
@@ -194,6 +193,9 @@ bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);
+bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -2399,7 +2401,7 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
}
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
+u64 kvm_scale_tsc(u64 tsc, u64 ratio)
{
u64 _tsc = tsc;
@@ -2414,7 +2416,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
- tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
return target_tsc - tsc;
}
@@ -2422,7 +2424,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
return vcpu->arch.l1_tsc_offset +
- kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
+ kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
@@ -2625,7 +2627,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
WARN_ON(adjustment < 0);
- adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
+ adjustment = kvm_scale_tsc((u64) adjustment,
vcpu->arch.l1_tsc_scaling_ratio);
adjust_tsc_offset_guest(vcpu, adjustment);
}
@@ -3045,7 +3047,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
- tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
+ tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
@@ -3268,7 +3270,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_all)(vcpu);
+ static_call(kvm_x86_flush_tlb_all)(vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -3286,14 +3288,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_prev_roots(vcpu);
}
- static_call(kvm_x86_tlb_flush_guest)(vcpu);
+ static_call(kvm_x86_flush_tlb_guest)(vcpu);
}
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
}
/*
@@ -3857,7 +3859,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
ratio = vcpu->arch.tsc_scaling_ratio;
}
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
+ msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
break;
}
case MSR_MTRRcap:
@@ -4233,6 +4235,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_SYS_ATTRIBUTES:
+ case KVM_CAP_VAPIC:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4273,9 +4276,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
- case KVM_CAP_VAPIC:
- r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
- break;
case KVM_CAP_NR_VCPUS:
r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
break;
@@ -5132,7 +5132,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
kvm->arch.last_tsc_offset == offset);
- tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
ns = get_kvmclock_base_ns();
__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
@@ -5977,15 +5977,18 @@ split_irqchip_unlock:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
r = -EINVAL;
- if (kvm_x86_ops.vm_copy_enc_context_from)
- r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
- return r;
+ if (!kvm_x86_ops.vm_copy_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+ break;
case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
r = -EINVAL;
- if (kvm_x86_ops.vm_move_enc_context_from)
- r = kvm_x86_ops.vm_move_enc_context_from(
- kvm, cap->args[0]);
- return r;
+ if (!kvm_x86_ops.vm_move_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+ break;
case KVM_CAP_EXIT_HYPERCALL:
if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
r = -EINVAL;
@@ -6460,8 +6463,10 @@ set_pit2_out:
break;
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_op)
- r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
+ if (!kvm_x86_ops.mem_enc_ioctl)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -6472,8 +6477,10 @@ set_pit2_out:
goto out;
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_reg_region)
- r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
+ if (!kvm_x86_ops.mem_enc_register_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -6484,8 +6491,10 @@ set_pit2_out:
goto out;
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_unreg_region)
- r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
+ if (!kvm_x86_ops.mem_enc_unregister_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -8413,8 +8422,7 @@ writeback:
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
- if (kvm_x86_ops.update_emulated_instruction)
- static_call(kvm_x86_update_emulated_instruction)(vcpu);
+ static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -8965,7 +8973,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
*
* @apicid - apicid of vcpu to be kicked.
*/
-static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
{
struct kvm_lapic_irq lapic_irq;
@@ -9084,7 +9092,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
break;
- kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+ kvm_pv_kick_cpu_op(vcpu->kvm, a1);
kvm_sched_yield(vcpu, a1);
ret = 0;
break;
@@ -9254,10 +9262,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
*/
else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected) {
- static_call(kvm_x86_set_nmi)(vcpu);
+ static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
- static_call(kvm_x86_set_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu);
can_inject = false;
}
}
@@ -9337,7 +9345,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- static_call(kvm_x86_set_nmi)(vcpu);
+ static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
}
@@ -9351,7 +9359,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
goto out;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- static_call(kvm_x86_set_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu);
WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
}
if (kvm_cpu_has_injectable_intr(vcpu))
@@ -9679,8 +9687,7 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
- if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
- !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
+ if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
old = new = kvm->arch.apicv_inhibit_reasons;
@@ -9713,10 +9720,12 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
} else
kvm->arch.apicv_inhibit_reasons = new;
}
-EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
+ if (!enable_apicv)
+ return;
+
down_write(&kvm->arch.apicv_update_lock);
__kvm_request_apicv_update(kvm, activate, bit);
up_write(&kvm->arch.apicv_update_lock);
@@ -9755,11 +9764,11 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
- static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
return;
}
- static_call(kvm_x86_load_eoi_exitmap)(
+ static_call_cond(kvm_x86_load_eoi_exitmap)(
vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
@@ -9782,10 +9791,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- if (!kvm_x86_ops.set_apic_access_page_addr)
- return;
-
- static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
+ static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -9975,7 +9981,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- static_call(kvm_x86_prepare_guest_switch)(vcpu);
+ static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -9983,7 +9989,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* result in virtual interrupt delivery.
*/
local_irq_disable();
- vcpu->mode = IN_GUEST_MODE;
+
+ /* Store vcpu->apicv_active before vcpu->mode. */
+ smp_store_release(&vcpu->mode, IN_GUEST_MODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -10054,7 +10062,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
- exit_fastpath = static_call(kvm_x86_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -10357,10 +10365,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- /*
- * Exclude PKRU from restore as restored separately in
- * kvm_x86_ops.run().
- */
+ /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
}
@@ -10543,16 +10548,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
return 0;
}
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- struct kvm_segment cs;
-
- kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
- *db = cs.db;
- *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
@@ -11975,6 +11970,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
return;
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
+
if (kvm_x86_ops.cpu_dirty_log_size) {
kvm_mmu_slot_leaf_clear_dirty(kvm, new);
kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
@@ -12019,8 +12017,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
- kvm_x86_ops.guest_apic_has_interrupt &&
- static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
+ static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -12375,7 +12372,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- static_call_cond(kvm_x86_start_assignment)(kvm);
+ static_call_cond(kvm_x86_pi_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -12423,7 +12420,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
@@ -12448,7 +12445,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -12459,7 +12456,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
+ return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 767ec7f99516..aa86abad914d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -307,6 +307,8 @@ extern int pi_inject_timer;
extern bool report_ignored_msrs;
+extern bool eager_page_split;
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index bad57535fad0..4aa0f2b31665 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -133,32 +133,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
+ struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
+ struct kvm_memslots *slots = kvm_memslots(v->kvm);
+ bool atomic = (state == RUNSTATE_runnable);
uint64_t state_entry_time;
- unsigned int offset;
+ int __user *user_state;
+ uint64_t __user *user_times;
kvm_xen_update_runstate(v, state);
if (!vx->runstate_set)
return;
- BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+ if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
+ kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
+ return;
+
+ /* We made sure it fits in a single page */
+ BUG_ON(!ghc->memslot);
+
+ if (atomic)
+ pagefault_disable();
- offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
-#ifdef CONFIG_X86_64
/*
- * The only difference is alignment of uint64_t in 32-bit.
- * So the first field 'state' is accessed directly using
- * offsetof() (where its offset happens to be zero), while the
- * remaining fields which are all uint64_t, start at 'offset'
- * which we tweak here by adding 4.
+ * The only difference between 32-bit and 64-bit versions of the
+ * runstate struct us the alignment of uint64_t in 32-bit, which
+ * means that the 64-bit version has an additional 4 bytes of
+ * padding after the first field 'state'.
+ *
+ * So we use 'int __user *user_state' to point to the state field,
+ * and 'uint64_t __user *user_times' for runstate_entry_time. So
+ * the actual array of time[] in each state starts at user_times[1].
*/
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+ user_state = (int __user *)ghc->hva;
+
+ BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+ user_times = (uint64_t __user *)(ghc->hva +
+ offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time));
+#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
if (v->kvm->arch.xen.long_mode)
- offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+ user_times = (uint64_t __user *)(ghc->hva +
+ offsetof(struct vcpu_runstate_info,
+ state_entry_time));
#endif
/*
* First write the updated state_entry_time at the appropriate
@@ -172,10 +197,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
sizeof(state_entry_time));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &state_entry_time, offset,
- sizeof(state_entry_time)))
- return;
+ if (__put_user(state_entry_time, user_times))
+ goto out;
smp_wmb();
/*
@@ -189,11 +212,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &vx->current_runstate,
- offsetof(struct vcpu_runstate_info, state),
- sizeof(vx->current_runstate)))
- return;
+ if (__put_user(vx->current_runstate, user_state))
+ goto out;
/*
* Write the actual runstate times immediately after the
@@ -208,24 +228,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &vx->runstate_times[0],
- offset + sizeof(u64),
- sizeof(vx->runstate_times)))
- return;
-
+ if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
+ goto out;
smp_wmb();
/*
* Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
* runstate_entry_time field.
*/
-
state_entry_time &= ~XEN_RUNSTATE_UPDATE;
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &state_entry_time, offset,
- sizeof(state_entry_time)))
- return;
+ __put_user(state_entry_time, user_times);
+ smp_wmb();
+
+ out:
+ mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+ if (atomic)
+ pagefault_enable();
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -443,6 +462,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_info_cache,
data->u.gpa,
@@ -460,6 +485,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_time_info_cache,
data->u.gpa,
@@ -481,6 +512,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.runstate_cache,
data->u.gpa,
@@ -695,7 +732,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
instructions[0] = 0xb8;
/* vmcall / vmmcall */
- kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
/* ret */
instructions[8] = 0xc3;
@@ -830,7 +867,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
- vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+ vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
vcpu->run->xen.u.hcall.input = input;
vcpu->run->xen.u.hcall.params[0] = params[0];
vcpu->run->xen.u.hcall.params[1] = params[1];
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index 8f97c2927bee..fdce7a4cfc6f 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -183,11 +183,18 @@ enum HV_GENERIC_SET_FORMAT {
#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
#define HV_HYPERCALL_FAST_BIT BIT(16)
#define HV_HYPERCALL_VARHEAD_OFFSET 17
+#define HV_HYPERCALL_VARHEAD_MASK GENMASK_ULL(26, 17)
+#define HV_HYPERCALL_RSVD0_MASK GENMASK_ULL(31, 27)
#define HV_HYPERCALL_REP_COMP_OFFSET 32
#define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32)
#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
+#define HV_HYPERCALL_RSVD1_MASK GENMASK_ULL(47, 44)
#define HV_HYPERCALL_REP_START_OFFSET 48
#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
+#define HV_HYPERCALL_RSVD2_MASK GENMASK_ULL(63, 60)
+#define HV_HYPERCALL_RSVD_MASK (HV_HYPERCALL_RSVD0_MASK | \
+ HV_HYPERCALL_RSVD1_MASK | \
+ HV_HYPERCALL_RSVD2_MASK)
/* hypercall status code */
#define HV_STATUS_SUCCESS 0
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 43ba0b1e0edb..f2b8baea35d2 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -503,6 +503,7 @@ long __static_call_return0(void)
{
return 0;
}
+EXPORT_SYMBOL_GPL(__static_call_return0);
#ifdef CONFIG_STATIC_CALL_SELFTEST
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 086f490e808d..f9943b96ab3f 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -51,6 +51,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
+TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test
TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test
TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
@@ -82,7 +83,6 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test
TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
-TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test
TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
TEST_GEN_PROGS_x86_64 += x86_64/amx_test
TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 1954b964d1cf..101759ac93a4 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -298,12 +298,18 @@ static void run_test(enum vm_guest_mode mode, void *arg)
static void help(char *name)
{
puts("");
- printf("usage: %s [-h] [-i iterations] [-p offset] "
+ printf("usage: %s [-h] [-i iterations] [-p offset] [-g]"
"[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]"
"[-x memslots]\n", name);
puts("");
printf(" -i: specify iteration counts (default: %"PRIu64")\n",
TEST_HOST_LOOP_N);
+ printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n"
+ " makes KVM_GET_DIRTY_LOG clear the dirty log (i.e.\n"
+ " KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE is not enabled)\n"
+ " and writes will be tracked as soon as dirty logging is\n"
+ " enabled on the memslot (i.e. KVM_DIRTY_LOG_INITIALLY_SET\n"
+ " is not enabled).\n");
printf(" -p: specify guest physical test memory offset\n"
" Warning: a low offset can conflict with the loaded test code.\n");
guest_modes_help();
@@ -343,8 +349,11 @@ int main(int argc, char *argv[])
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:x:")) != -1) {
+ while ((opt = getopt(argc, argv, "ghi:p:m:b:f:v:os:x:")) != -1) {
switch (opt) {
+ case 'g':
+ dirty_log_manual_caps = 0;
+ break;
case 'i':
p.iterations = atoi(optarg);
break;
diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h
index c9af97abd622..cc5d14a45702 100644
--- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h
+++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h
@@ -213,6 +213,25 @@ struct hv_enlightened_vmcs {
u64 padding64_6[7];
};
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
+
#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
@@ -648,381 +667,507 @@ static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value)
switch (encoding) {
case GUEST_RIP:
current_evmcs->guest_rip = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case GUEST_RSP:
current_evmcs->guest_rsp = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC;
break;
case GUEST_RFLAGS:
current_evmcs->guest_rflags = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC;
break;
case HOST_IA32_PAT:
current_evmcs->host_ia32_pat = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_IA32_EFER:
current_evmcs->host_ia32_efer = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_CR0:
current_evmcs->host_cr0 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_CR3:
current_evmcs->host_cr3 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_CR4:
current_evmcs->host_cr4 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_IA32_SYSENTER_ESP:
current_evmcs->host_ia32_sysenter_esp = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_IA32_SYSENTER_EIP:
current_evmcs->host_ia32_sysenter_eip = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_RIP:
current_evmcs->host_rip = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case IO_BITMAP_A:
current_evmcs->io_bitmap_a = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP;
break;
case IO_BITMAP_B:
current_evmcs->io_bitmap_b = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP;
break;
case MSR_BITMAP:
current_evmcs->msr_bitmap = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
break;
case GUEST_ES_BASE:
current_evmcs->guest_es_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_CS_BASE:
current_evmcs->guest_cs_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_SS_BASE:
current_evmcs->guest_ss_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_DS_BASE:
current_evmcs->guest_ds_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_FS_BASE:
current_evmcs->guest_fs_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_GS_BASE:
current_evmcs->guest_gs_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_LDTR_BASE:
current_evmcs->guest_ldtr_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_TR_BASE:
current_evmcs->guest_tr_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_GDTR_BASE:
current_evmcs->guest_gdtr_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_IDTR_BASE:
current_evmcs->guest_idtr_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case TSC_OFFSET:
current_evmcs->tsc_offset = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2;
break;
case VIRTUAL_APIC_PAGE_ADDR:
current_evmcs->virtual_apic_page_addr = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2;
break;
case VMCS_LINK_POINTER:
current_evmcs->vmcs_link_pointer = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_IA32_DEBUGCTL:
current_evmcs->guest_ia32_debugctl = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_IA32_PAT:
current_evmcs->guest_ia32_pat = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_IA32_EFER:
current_evmcs->guest_ia32_efer = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_PDPTR0:
current_evmcs->guest_pdptr0 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_PDPTR1:
current_evmcs->guest_pdptr1 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_PDPTR2:
current_evmcs->guest_pdptr2 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_PDPTR3:
current_evmcs->guest_pdptr3 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_PENDING_DBG_EXCEPTIONS:
current_evmcs->guest_pending_dbg_exceptions = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_SYSENTER_ESP:
current_evmcs->guest_sysenter_esp = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_SYSENTER_EIP:
current_evmcs->guest_sysenter_eip = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case CR0_GUEST_HOST_MASK:
current_evmcs->cr0_guest_host_mask = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR;
break;
case CR4_GUEST_HOST_MASK:
current_evmcs->cr4_guest_host_mask = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR;
break;
case CR0_READ_SHADOW:
current_evmcs->cr0_read_shadow = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR;
break;
case CR4_READ_SHADOW:
current_evmcs->cr4_read_shadow = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR;
break;
case GUEST_CR0:
current_evmcs->guest_cr0 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR;
break;
case GUEST_CR3:
current_evmcs->guest_cr3 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR;
break;
case GUEST_CR4:
current_evmcs->guest_cr4 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR;
break;
case GUEST_DR7:
current_evmcs->guest_dr7 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR;
break;
case HOST_FS_BASE:
current_evmcs->host_fs_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER;
break;
case HOST_GS_BASE:
current_evmcs->host_gs_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER;
break;
case HOST_TR_BASE:
current_evmcs->host_tr_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER;
break;
case HOST_GDTR_BASE:
current_evmcs->host_gdtr_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER;
break;
case HOST_IDTR_BASE:
current_evmcs->host_idtr_base = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER;
break;
case HOST_RSP:
current_evmcs->host_rsp = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER;
break;
case EPT_POINTER:
current_evmcs->ept_pointer = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT;
break;
case GUEST_BNDCFGS:
current_evmcs->guest_bndcfgs = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case XSS_EXIT_BITMAP:
current_evmcs->xss_exit_bitmap = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2;
break;
case GUEST_PHYSICAL_ADDRESS:
current_evmcs->guest_physical_address = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case EXIT_QUALIFICATION:
current_evmcs->exit_qualification = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case GUEST_LINEAR_ADDRESS:
current_evmcs->guest_linear_address = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case VM_EXIT_MSR_STORE_ADDR:
current_evmcs->vm_exit_msr_store_addr = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case VM_EXIT_MSR_LOAD_ADDR:
current_evmcs->vm_exit_msr_load_addr = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case VM_ENTRY_MSR_LOAD_ADDR:
current_evmcs->vm_entry_msr_load_addr = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case CR3_TARGET_VALUE0:
current_evmcs->cr3_target_value0 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case CR3_TARGET_VALUE1:
current_evmcs->cr3_target_value1 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case CR3_TARGET_VALUE2:
current_evmcs->cr3_target_value2 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case CR3_TARGET_VALUE3:
current_evmcs->cr3_target_value3 = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case TPR_THRESHOLD:
current_evmcs->tpr_threshold = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case GUEST_INTERRUPTIBILITY_INFO:
current_evmcs->guest_interruptibility_info = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC;
break;
case CPU_BASED_VM_EXEC_CONTROL:
current_evmcs->cpu_based_vm_exec_control = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC;
break;
case EXCEPTION_BITMAP:
current_evmcs->exception_bitmap = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN;
break;
case VM_ENTRY_CONTROLS:
current_evmcs->vm_entry_controls = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY;
break;
case VM_ENTRY_INTR_INFO_FIELD:
current_evmcs->vm_entry_intr_info_field = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT;
break;
case VM_ENTRY_EXCEPTION_ERROR_CODE:
current_evmcs->vm_entry_exception_error_code = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT;
break;
case VM_ENTRY_INSTRUCTION_LEN:
current_evmcs->vm_entry_instruction_len = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT;
break;
case HOST_IA32_SYSENTER_CS:
current_evmcs->host_ia32_sysenter_cs = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case PIN_BASED_VM_EXEC_CONTROL:
current_evmcs->pin_based_vm_exec_control = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1;
break;
case VM_EXIT_CONTROLS:
current_evmcs->vm_exit_controls = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1;
break;
case SECONDARY_VM_EXEC_CONTROL:
current_evmcs->secondary_vm_exec_control = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1;
break;
case GUEST_ES_LIMIT:
current_evmcs->guest_es_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_CS_LIMIT:
current_evmcs->guest_cs_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_SS_LIMIT:
current_evmcs->guest_ss_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_DS_LIMIT:
current_evmcs->guest_ds_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_FS_LIMIT:
current_evmcs->guest_fs_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_GS_LIMIT:
current_evmcs->guest_gs_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_LDTR_LIMIT:
current_evmcs->guest_ldtr_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_TR_LIMIT:
current_evmcs->guest_tr_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_GDTR_LIMIT:
current_evmcs->guest_gdtr_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_IDTR_LIMIT:
current_evmcs->guest_idtr_limit = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_ES_AR_BYTES:
current_evmcs->guest_es_ar_bytes = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_CS_AR_BYTES:
current_evmcs->guest_cs_ar_bytes = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_SS_AR_BYTES:
current_evmcs->guest_ss_ar_bytes = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_DS_AR_BYTES:
current_evmcs->guest_ds_ar_bytes = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_FS_AR_BYTES:
current_evmcs->guest_fs_ar_bytes = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_GS_AR_BYTES:
current_evmcs->guest_gs_ar_bytes = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_LDTR_AR_BYTES:
current_evmcs->guest_ldtr_ar_bytes = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_TR_AR_BYTES:
current_evmcs->guest_tr_ar_bytes = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_ACTIVITY_STATE:
current_evmcs->guest_activity_state = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case GUEST_SYSENTER_CS:
current_evmcs->guest_sysenter_cs = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1;
break;
case VM_INSTRUCTION_ERROR:
current_evmcs->vm_instruction_error = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case VM_EXIT_REASON:
current_evmcs->vm_exit_reason = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case VM_EXIT_INTR_INFO:
current_evmcs->vm_exit_intr_info = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case VM_EXIT_INTR_ERROR_CODE:
current_evmcs->vm_exit_intr_error_code = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case IDT_VECTORING_INFO_FIELD:
current_evmcs->idt_vectoring_info_field = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case IDT_VECTORING_ERROR_CODE:
current_evmcs->idt_vectoring_error_code = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case VM_EXIT_INSTRUCTION_LEN:
current_evmcs->vm_exit_instruction_len = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case VMX_INSTRUCTION_INFO:
current_evmcs->vmx_instruction_info = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE;
break;
case PAGE_FAULT_ERROR_CODE_MASK:
current_evmcs->page_fault_error_code_mask = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case PAGE_FAULT_ERROR_CODE_MATCH:
current_evmcs->page_fault_error_code_match = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case CR3_TARGET_COUNT:
current_evmcs->cr3_target_count = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case VM_EXIT_MSR_STORE_COUNT:
current_evmcs->vm_exit_msr_store_count = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case VM_EXIT_MSR_LOAD_COUNT:
current_evmcs->vm_exit_msr_load_count = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case VM_ENTRY_MSR_LOAD_COUNT:
current_evmcs->vm_entry_msr_load_count = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
break;
case HOST_ES_SELECTOR:
current_evmcs->host_es_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_CS_SELECTOR:
current_evmcs->host_cs_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_SS_SELECTOR:
current_evmcs->host_ss_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_DS_SELECTOR:
current_evmcs->host_ds_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_FS_SELECTOR:
current_evmcs->host_fs_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_GS_SELECTOR:
current_evmcs->host_gs_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case HOST_TR_SELECTOR:
current_evmcs->host_tr_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
break;
case GUEST_ES_SELECTOR:
current_evmcs->guest_es_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_CS_SELECTOR:
current_evmcs->guest_cs_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_SS_SELECTOR:
current_evmcs->guest_ss_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_DS_SELECTOR:
current_evmcs->guest_ds_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_FS_SELECTOR:
current_evmcs->guest_fs_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_GS_SELECTOR:
current_evmcs->guest_gs_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_LDTR_SELECTOR:
current_evmcs->guest_ldtr_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case GUEST_TR_SELECTOR:
current_evmcs->guest_tr_selector = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2;
break;
case VIRTUAL_PROCESSOR_ID:
current_evmcs->virtual_processor_id = value;
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT;
break;
default: return 1;
}
@@ -1070,7 +1215,10 @@ static inline int evmcs_vmresume(void)
{
int ret;
- current_evmcs->hv_clean_fields = 0;
+ /* HOST_RIP */
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1;
+ /* HOST_RSP */
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER;
__asm__ __volatile__("push %%rbp;"
"push %%rcx;"
diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h
index f4ea2355dbc2..2225e5077350 100644
--- a/tools/testing/selftests/kvm/include/x86_64/svm.h
+++ b/tools/testing/selftests/kvm/include/x86_64/svm.h
@@ -99,7 +99,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u8 reserved_6[8]; /* Offset 0xe8 */
u64 avic_logical_id; /* Offset 0xf0 */
u64 avic_physical_id; /* Offset 0xf8 */
- u8 reserved_7[768];
+ u8 reserved_7[8];
+ u64 vmsa_pa; /* Used for an SEV-ES guest */
+ u8 reserved_8[720];
+ /*
+ * Offset 0x3e0, 32 bytes reserved
+ * for use by hypervisor/software.
+ */
+ u8 reserved_sw[32];
};
diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h
index 587fbe408b99..a25aabd8f5e7 100644
--- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h
+++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h
@@ -16,6 +16,7 @@
#define CPUID_SVM_BIT 2
#define CPUID_SVM BIT_ULL(CPUID_SVM_BIT)
+#define SVM_EXIT_MSR 0x07c
#define SVM_EXIT_VMMCALL 0x081
struct svm_test_data {
@@ -28,6 +29,11 @@ struct svm_test_data {
struct vmcb_save_area *save_area; /* gva */
void *save_area_hva;
uint64_t save_area_gpa;
+
+ /* MSR-Bitmap */
+ void *msr; /* gva */
+ void *msr_hva;
+ uint64_t msr_gpa;
};
struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c
index 0ebc03ce079c..736ee4a23df6 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/svm.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c
@@ -43,6 +43,11 @@ vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva)
svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area);
svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area);
+ svm->msr = (void *)vm_vaddr_alloc_page(vm);
+ svm->msr_hva = addr_gva2hva(vm, (uintptr_t)svm->msr);
+ svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr);
+ memset(svm->msr_hva, 0, getpagesize());
+
*p_svm_gva = svm_gva;
return svm;
}
@@ -106,6 +111,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r
save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR);
ctrl->intercept = (1ULL << INTERCEPT_VMRUN) |
(1ULL << INTERCEPT_VMMCALL);
+ ctrl->msrpm_base_pa = svm->msr_gpa;
vmcb->save.rip = (u64)guest_rip;
vmcb->save.rsp = (u64)guest_rsp;
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
index 4c7841dfd481..d12e043aa2ee 100644
--- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
+#include <linux/bitmap.h>
#include "test_util.h"
@@ -32,6 +33,22 @@ static void guest_nmi_handler(struct ex_regs *regs)
{
}
+/* Exits to L1 destroy GRPs! */
+static inline void rdmsr_fs_base(void)
+{
+ __asm__ __volatile__ ("mov $0xc0000100, %%rcx; rdmsr" : : :
+ "rax", "rbx", "rcx", "rdx",
+ "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
+ "r13", "r14", "r15");
+}
+static inline void rdmsr_gs_base(void)
+{
+ __asm__ __volatile__ ("mov $0xc0000101, %%rcx; rdmsr" : : :
+ "rax", "rbx", "rcx", "rdx",
+ "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
+ "r13", "r14", "r15");
+}
+
void l2_guest_code(void)
{
GUEST_SYNC(7);
@@ -41,6 +58,15 @@ void l2_guest_code(void)
/* Forced exit to L1 upon restore */
GUEST_SYNC(9);
+ vmcall();
+
+ /* MSR-Bitmap tests */
+ rdmsr_fs_base(); /* intercepted */
+ rdmsr_fs_base(); /* intercepted */
+ rdmsr_gs_base(); /* not intercepted */
+ vmcall();
+ rdmsr_gs_base(); /* intercepted */
+
/* Done, exit to L1 and never come back. */
vmcall();
}
@@ -76,8 +102,9 @@ void guest_code(struct vmx_pages *vmx_pages)
current_evmcs->revision_id = EVMCS_VERSION;
GUEST_SYNC(6);
- current_evmcs->pin_based_vm_exec_control |=
- PIN_BASED_NMI_EXITING;
+ vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
+ PIN_BASED_NMI_EXITING);
+
GUEST_ASSERT(!vmlaunch());
GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
@@ -91,6 +118,39 @@ void guest_code(struct vmx_pages *vmx_pages)
GUEST_SYNC(10);
GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ current_evmcs->guest_rip += 3; /* vmcall */
+
+ /* Intercept RDMSR 0xc0000100 */
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) |
+ CPU_BASED_USE_MSR_BITMAPS);
+ set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400);
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
+ current_evmcs->guest_rip += 2; /* rdmsr */
+
+ /* Enable enlightened MSR bitmap */
+ current_evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
+ current_evmcs->guest_rip += 2; /* rdmsr */
+
+ /* Intercept RDMSR 0xc0000101 without telling KVM about it */
+ set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400);
+ /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */
+ current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ GUEST_ASSERT(!vmresume());
+ /* Make sure we don't see EXIT_REASON_MSR_READ here so eMSR bitmap works */
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ current_evmcs->guest_rip += 3; /* vmcall */
+
+ /* Now tell KVM we've changed MSR-Bitmap */
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
+ current_evmcs->guest_rip += 2; /* rdmsr */
+
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
GUEST_SYNC(11);
/* Try enlightened vmptrld with an incorrect GPA */
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
index 7e2d2d17d2ed..8c245ab2d98a 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -49,16 +49,13 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
bool evmcs_expected)
{
int i;
- int nent = 9;
+ int nent_expected = 10;
u32 test_val;
- if (evmcs_expected)
- nent += 1; /* 0x4000000A */
-
- TEST_ASSERT(hv_cpuid_entries->nent == nent,
+ TEST_ASSERT(hv_cpuid_entries->nent == nent_expected,
"KVM_GET_SUPPORTED_HV_CPUID should return %d entries"
- " with evmcs=%d (returned %d)",
- nent, evmcs_expected, hv_cpuid_entries->nent);
+ " (returned %d)",
+ nent_expected, hv_cpuid_entries->nent);
for (i = 0; i < hv_cpuid_entries->nent; i++) {
struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i];
@@ -68,9 +65,6 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
"function %x is our of supported range",
entry->function);
- TEST_ASSERT(evmcs_expected || (entry->function != 0x4000000A),
- "0x4000000A leaf should not be reported");
-
TEST_ASSERT(entry->index == 0,
".index field should be zero");
@@ -97,8 +91,20 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
"NoNonArchitecturalCoreSharing bit"
" doesn't reflect SMT setting");
break;
- }
+ case 0x4000000A:
+ TEST_ASSERT(entry->eax & (1UL << 19),
+ "Enlightened MSR-Bitmap should always be supported"
+ " 0x40000000.EAX: %x", entry->eax);
+ if (evmcs_expected)
+ TEST_ASSERT((entry->eax & 0xffff) == 0x101,
+ "Supported Enlightened VMCS version range is supposed to be 1:1"
+ " 0x40000000.EAX: %x", entry->eax);
+
+ break;
+ default:
+ break;
+ }
/*
* If needed for debug:
* fprintf(stdout,
@@ -107,7 +113,6 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
* entry->edx);
*/
}
-
}
void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system)
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
new file mode 100644
index 000000000000..21f5ca9197da
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM_GET/SET_* tests
+ *
+ * Copyright (C) 2022, Red Hat, Inc.
+ *
+ * Tests for Hyper-V extensions to SVM.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/bitmap.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "hyperv.h"
+
+#define VCPU_ID 1
+#define L2_GUEST_STACK_SIZE 256
+
+struct hv_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB
+ */
+#define VMCB_HV_NESTED_ENLIGHTENMENTS (1U << 31)
+
+static inline void vmmcall(void)
+{
+ __asm__ __volatile__("vmmcall");
+}
+
+void l2_guest_code(void)
+{
+ GUEST_SYNC(3);
+ /* Exit to L1 */
+ vmmcall();
+
+ /* MSR-Bitmap tests */
+ rdmsr(MSR_FS_BASE); /* intercepted */
+ rdmsr(MSR_FS_BASE); /* intercepted */
+ rdmsr(MSR_GS_BASE); /* not intercepted */
+ vmmcall();
+ rdmsr(MSR_GS_BASE); /* intercepted */
+
+ GUEST_SYNC(5);
+
+ /* Done, exit to L1 and never come back. */
+ vmmcall();
+}
+
+static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)vmcb->control.reserved_sw;
+
+ GUEST_SYNC(1);
+
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48);
+
+ GUEST_ASSERT(svm->vmcb_gpa);
+ /* Prepare for L2 execution. */
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(2);
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_SYNC(4);
+ vmcb->save.rip += 3;
+
+ /* Intercept RDMSR 0xc0000100 */
+ vmcb->control.intercept |= 1ULL << INTERCEPT_MSR_PROT;
+ set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800);
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
+ vmcb->save.rip += 2; /* rdmsr */
+
+ /* Enable enlightened MSR bitmap */
+ hve->hv_enlightenments_control.msr_bitmap = 1;
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
+ vmcb->save.rip += 2; /* rdmsr */
+
+ /* Intercept RDMSR 0xc0000101 without telling KVM about it */
+ set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800);
+ /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */
+ vmcb->control.clean |= VMCB_HV_NESTED_ENLIGHTENMENTS;
+ run_guest(vmcb, svm->vmcb_gpa);
+ /* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ vmcb->save.rip += 3; /* vmcall */
+
+ /* Now tell KVM we've changed MSR-Bitmap */
+ vmcb->control.clean &= ~VMCB_HV_NESTED_ENLIGHTENMENTS;
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
+ vmcb->save.rip += 2; /* rdmsr */
+
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_SYNC(6);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t nested_gva = 0;
+
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct ucall uc;
+ int stage;
+
+ if (!nested_svm_supported()) {
+ print_skip("Nested SVM not supported");
+ exit(KSFT_SKIP);
+ }
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_hv_cpuid(vm, VCPU_ID);
+ run = vcpu_state(vm, VCPU_ID);
+ vcpu_alloc_svm(vm, &nested_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, nested_gva);
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index 80056bbbb003..d1dc1acf997c 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -21,6 +21,8 @@
#define NR_LOCK_TESTING_THREADS 3
#define NR_LOCK_TESTING_ITERATIONS 10000
+bool have_sev_es;
+
static int __sev_ioctl(int vm_fd, int cmd_id, void *data, __u32 *fw_error)
{
struct kvm_sev_cmd cmd = {
@@ -172,10 +174,18 @@ static void test_sev_migrate_parameters(void)
*sev_es_vm_no_vmsa;
int ret;
- sev_vm = sev_vm_create(/* es= */ false);
- sev_es_vm = sev_vm_create(/* es= */ true);
vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
vm_no_sev = aux_vm_create(true);
+ ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd);
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "Migrations require SEV enabled. ret %d, errno: %d\n", ret,
+ errno);
+
+ if (!have_sev_es)
+ goto out;
+
+ sev_vm = sev_vm_create(/* es= */ false);
+ sev_es_vm = sev_vm_create(/* es= */ true);
sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL);
vm_vcpu_add(sev_es_vm_no_vmsa, 1);
@@ -204,14 +214,10 @@ static void test_sev_migrate_parameters(void)
"SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d\n",
ret, errno);
- ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd);
- TEST_ASSERT(ret == -1 && errno == EINVAL,
- "Migrations require SEV enabled. ret %d, errno: %d\n", ret,
- errno);
-
kvm_vm_free(sev_vm);
kvm_vm_free(sev_es_vm);
kvm_vm_free(sev_es_vm_no_vmsa);
+out:
kvm_vm_free(vm_no_vcpu);
kvm_vm_free(vm_no_sev);
}
@@ -300,7 +306,6 @@ static void test_sev_mirror_parameters(void)
int ret;
sev_vm = sev_vm_create(/* es= */ false);
- sev_es_vm = sev_vm_create(/* es= */ true);
vm_with_vcpu = aux_vm_create(true);
vm_no_vcpu = aux_vm_create(false);
@@ -310,6 +315,21 @@ static void test_sev_mirror_parameters(void)
"Should not be able copy context to self. ret: %d, errno: %d\n",
ret, errno);
+ ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd);
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "Copy context requires SEV enabled. ret %d, errno: %d\n", ret,
+ errno);
+
+ ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+ "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n",
+ ret, errno);
+
+ if (!have_sev_es)
+ goto out;
+
+ sev_es_vm = sev_vm_create(/* es= */ true);
ret = __sev_mirror_create(sev_vm->fd, sev_es_vm->fd);
TEST_ASSERT(
ret == -1 && errno == EINVAL,
@@ -322,63 +342,97 @@ static void test_sev_mirror_parameters(void)
"Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d\n",
ret, errno);
- ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd);
- TEST_ASSERT(ret == -1 && errno == EINVAL,
- "Copy context requires SEV enabled. ret %d, errno: %d\n", ret,
- errno);
-
- ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd);
- TEST_ASSERT(
- ret == -1 && errno == EINVAL,
- "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n",
- ret, errno);
+ kvm_vm_free(sev_es_vm);
+out:
kvm_vm_free(sev_vm);
- kvm_vm_free(sev_es_vm);
kvm_vm_free(vm_with_vcpu);
kvm_vm_free(vm_no_vcpu);
}
static void test_sev_move_copy(void)
{
- struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm;
- int ret;
+ struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm,
+ *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm;
sev_vm = sev_vm_create(/* es= */ false);
dst_vm = aux_vm_create(true);
+ dst2_vm = aux_vm_create(true);
+ dst3_vm = aux_vm_create(true);
mirror_vm = aux_vm_create(false);
dst_mirror_vm = aux_vm_create(false);
+ dst2_mirror_vm = aux_vm_create(false);
+ dst3_mirror_vm = aux_vm_create(false);
sev_mirror_create(mirror_vm->fd, sev_vm->fd);
- ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
- TEST_ASSERT(ret == -1 && errno == EBUSY,
- "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
- errno);
- /* The mirror itself can be migrated. */
sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
- ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
- TEST_ASSERT(ret == -1 && errno == EBUSY,
- "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
- errno);
+ sev_migrate_from(dst_vm->fd, sev_vm->fd);
+
+ sev_migrate_from(dst2_vm->fd, dst_vm->fd);
+ sev_migrate_from(dst2_mirror_vm->fd, dst_mirror_vm->fd);
+
+ sev_migrate_from(dst3_mirror_vm->fd, dst2_mirror_vm->fd);
+ sev_migrate_from(dst3_vm->fd, dst2_vm->fd);
+
+ kvm_vm_free(dst_vm);
+ kvm_vm_free(sev_vm);
+ kvm_vm_free(dst2_vm);
+ kvm_vm_free(dst3_vm);
+ kvm_vm_free(mirror_vm);
+ kvm_vm_free(dst_mirror_vm);
+ kvm_vm_free(dst2_mirror_vm);
+ kvm_vm_free(dst3_mirror_vm);
/*
- * mirror_vm is not a mirror anymore, dst_mirror_vm is. Thus,
- * the owner can be copied as soon as dst_mirror_vm is gone.
+ * Run similar test be destroy mirrors before mirrored VMs to ensure
+ * destruction is done safely.
*/
- kvm_vm_free(dst_mirror_vm);
+ sev_vm = sev_vm_create(/* es= */ false);
+ dst_vm = aux_vm_create(true);
+ mirror_vm = aux_vm_create(false);
+ dst_mirror_vm = aux_vm_create(false);
+
+ sev_mirror_create(mirror_vm->fd, sev_vm->fd);
+
+ sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
sev_migrate_from(dst_vm->fd, sev_vm->fd);
kvm_vm_free(mirror_vm);
+ kvm_vm_free(dst_mirror_vm);
kvm_vm_free(dst_vm);
kvm_vm_free(sev_vm);
}
+#define X86_FEATURE_SEV (1 << 1)
+#define X86_FEATURE_SEV_ES (1 << 3)
+
int main(int argc, char *argv[])
{
+ struct kvm_cpuid_entry2 *cpuid;
+
+ if (!kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM) &&
+ !kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) {
+ print_skip("Capabilities not available");
+ exit(KSFT_SKIP);
+ }
+
+ cpuid = kvm_get_supported_cpuid_entry(0x80000000);
+ if (cpuid->eax < 0x8000001f) {
+ print_skip("AMD memory encryption not available");
+ exit(KSFT_SKIP);
+ }
+ cpuid = kvm_get_supported_cpuid_entry(0x8000001f);
+ if (!(cpuid->eax & X86_FEATURE_SEV)) {
+ print_skip("AMD SEV not available");
+ exit(KSFT_SKIP);
+ }
+ have_sev_es = !!(cpuid->eax & X86_FEATURE_SEV_ES);
+
if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) {
test_sev_migrate_from(/* es= */ false);
- test_sev_migrate_from(/* es= */ true);
+ if (have_sev_es)
+ test_sev_migrate_from(/* es= */ true);
test_sev_migrate_locking();
test_sev_migrate_parameters();
if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM))
@@ -386,7 +440,8 @@ int main(int argc, char *argv[])
}
if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) {
test_sev_mirror(/* es= */ false);
- test_sev_mirror(/* es= */ true);
+ if (have_sev_es)
+ test_sev_mirror(/* es= */ true);
test_sev_mirror_parameters();
}
return 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 58d31da8a2f7..83c57bcc6eb6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -246,9 +246,8 @@ static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
return true;
}
-static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
- unsigned int req, struct cpumask *tmp,
- int current_cpu)
+static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
+ struct cpumask *tmp, int current_cpu)
{
int cpu;
@@ -291,7 +290,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
vcpu = kvm_get_vcpu(kvm, i);
if (!vcpu)
continue;
- kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
+ kvm_make_vcpu_request(vcpu, req, cpus, me);
}
called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
@@ -317,7 +316,7 @@ bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu == except)
continue;
- kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
+ kvm_make_vcpu_request(vcpu, req, cpus, me);
}
called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));