diff options
58 files changed, 2017 insertions, 1355 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f5a27f067db9..05161afd7642 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2339,13 +2339,35 @@ kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) + kvm.eager_page_split= + [KVM,X86] Controls whether or not KVM will try to + proactively split all huge pages during dirty logging. + Eager page splitting reduces interruptions to vCPU + execution by eliminating the write-protection faults + and MMU lock contention that would otherwise be + required to split huge pages lazily. + + VM workloads that rarely perform writes or that write + only to a small region of VM memory may benefit from + disabling eager page splitting to allow huge pages to + still be used for reads. + + The behavior of eager page splitting depends on whether + KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If + disabled, all huge pages in a memslot will be eagerly + split when dirty logging is enabled on that memslot. If + enabled, eager page splitting will be performed during + the KVM_CLEAR_DIRTY ioctl, and only for the pages being + cleared. + + Eager page splitting currently only supports splitting + huge pages mapped by the TDP MMU. + + Default is Y (on). + kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface. Default is false (don't support). - kvm.mmu_audit= [KVM] This is a R/W parameter which allows audit - KVM MMU at runtime. - Default is 0 (off) - kvm.nx_huge_pages= [KVM] Controls the software workaround for the X86_BUG_ITLB_MULTIHIT bug. diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index 7068da080799..49837d3a3ef5 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -248,6 +248,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, IRQCHIP_STATE_PENDING, &val); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + } else if (vgic_irq_is_mapped_level(irq)) { + val = vgic_get_phys_line_level(irq); } else { val = irq_is_pending(irq); } diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index d39e0de06be2..29affccb353c 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -1,29 +1,30 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL) +#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_OPTIONAL) BUILD_BUG_ON(1) #endif /* - * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate - * "static_call()"s. They are also intended for use when defining - * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those - * functions that follow the [svm|vmx]_func_name convention. - * KVM_X86_OP_NULL() can leave a NULL definition for the - * case where there is no definition or a function name that - * doesn't match the typical naming convention is supplied. + * KVM_X86_OP() and KVM_X86_OP_OPTIONAL() are used to help generate + * both DECLARE/DEFINE_STATIC_CALL() invocations and + * "static_call_update()" calls. + * + * KVM_X86_OP_OPTIONAL() can be used for those functions that can have + * a NULL definition, for example if "static_call_cond()" will be used + * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise + * to make a definition optional, but in this case the default will + * be __static_call_return0. */ -KVM_X86_OP_NULL(hardware_enable) -KVM_X86_OP_NULL(hardware_disable) -KVM_X86_OP_NULL(hardware_unsetup) -KVM_X86_OP_NULL(cpu_has_accelerated_tpr) +KVM_X86_OP(hardware_enable) +KVM_X86_OP(hardware_disable) +KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) -KVM_X86_OP_NULL(vm_destroy) +KVM_X86_OP_OPTIONAL(vm_destroy) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) -KVM_X86_OP(prepare_guest_switch) +KVM_X86_OP(prepare_switch_to_guest) KVM_X86_OP(vcpu_load) KVM_X86_OP(vcpu_put) KVM_X86_OP(update_exception_bitmap) @@ -33,9 +34,9 @@ KVM_X86_OP(get_segment_base) KVM_X86_OP(get_segment) KVM_X86_OP(get_cpl) KVM_X86_OP(set_segment) -KVM_X86_OP_NULL(get_cs_db_l_bits) +KVM_X86_OP(get_cs_db_l_bits) KVM_X86_OP(set_cr0) -KVM_X86_OP_NULL(post_set_cr3) +KVM_X86_OP_OPTIONAL(post_set_cr3) KVM_X86_OP(is_valid_cr4) KVM_X86_OP(set_cr4) KVM_X86_OP(set_efer) @@ -49,22 +50,22 @@ KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) -KVM_X86_OP(tlb_flush_all) -KVM_X86_OP(tlb_flush_current) -KVM_X86_OP_NULL(tlb_remote_flush) -KVM_X86_OP_NULL(tlb_remote_flush_with_range) -KVM_X86_OP(tlb_flush_gva) -KVM_X86_OP(tlb_flush_guest) +KVM_X86_OP(flush_tlb_all) +KVM_X86_OP(flush_tlb_current) +KVM_X86_OP_OPTIONAL(tlb_remote_flush) +KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range) +KVM_X86_OP(flush_tlb_gva) +KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) -KVM_X86_OP(run) -KVM_X86_OP_NULL(handle_exit) -KVM_X86_OP_NULL(skip_emulated_instruction) -KVM_X86_OP_NULL(update_emulated_instruction) +KVM_X86_OP(vcpu_run) +KVM_X86_OP(handle_exit) +KVM_X86_OP(skip_emulated_instruction) +KVM_X86_OP_OPTIONAL(update_emulated_instruction) KVM_X86_OP(set_interrupt_shadow) KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) -KVM_X86_OP(set_irq) -KVM_X86_OP(set_nmi) +KVM_X86_OP(inject_irq) +KVM_X86_OP(inject_nmi) KVM_X86_OP(queue_exception) KVM_X86_OP(cancel_injection) KVM_X86_OP(interrupt_allowed) @@ -73,22 +74,22 @@ KVM_X86_OP(get_nmi_mask) KVM_X86_OP(set_nmi_mask) KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) -KVM_X86_OP(update_cr8_intercept) +KVM_X86_OP_OPTIONAL(update_cr8_intercept) KVM_X86_OP(check_apicv_inhibit_reasons) KVM_X86_OP(refresh_apicv_exec_ctrl) -KVM_X86_OP(hwapic_irr_update) -KVM_X86_OP(hwapic_isr_update) -KVM_X86_OP_NULL(guest_apic_has_interrupt) -KVM_X86_OP(load_eoi_exitmap) -KVM_X86_OP(set_virtual_apic_mode) -KVM_X86_OP_NULL(set_apic_access_page_addr) +KVM_X86_OP_OPTIONAL(hwapic_irr_update) +KVM_X86_OP_OPTIONAL(hwapic_isr_update) +KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt) +KVM_X86_OP_OPTIONAL(load_eoi_exitmap) +KVM_X86_OP_OPTIONAL(set_virtual_apic_mode) +KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) KVM_X86_OP(deliver_interrupt) -KVM_X86_OP_NULL(sync_pir_to_irr) -KVM_X86_OP(set_tss_addr) -KVM_X86_OP(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL(sync_pir_to_irr) +KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) +KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) -KVM_X86_OP_NULL(has_wbinvd_exit) +KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) KVM_X86_OP(get_l2_tsc_multiplier) KVM_X86_OP(write_tsc_offset) @@ -96,32 +97,36 @@ KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP_NULL(request_immediate_exit) +KVM_X86_OP(request_immediate_exit) KVM_X86_OP(sched_in) -KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(vcpu_blocking) -KVM_X86_OP_NULL(vcpu_unblocking) -KVM_X86_OP_NULL(update_pi_irte) -KVM_X86_OP_NULL(start_assignment) -KVM_X86_OP_NULL(apicv_post_state_restore) -KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) -KVM_X86_OP_NULL(set_hv_timer) -KVM_X86_OP_NULL(cancel_hv_timer) +KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) +KVM_X86_OP_OPTIONAL(vcpu_blocking) +KVM_X86_OP_OPTIONAL(vcpu_unblocking) +KVM_X86_OP_OPTIONAL(pi_update_irte) +KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(apicv_post_state_restore) +KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL(set_hv_timer) +KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) -KVM_X86_OP_NULL(mem_enc_op) -KVM_X86_OP_NULL(mem_enc_reg_region) -KVM_X86_OP_NULL(mem_enc_unreg_region) +KVM_X86_OP_OPTIONAL(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(mem_enc_register_region) +KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) +KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) +KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) -KVM_X86_OP_NULL(enable_direct_tlbflush) -KVM_X86_OP_NULL(migrate_timers) +KVM_X86_OP_OPTIONAL(enable_direct_tlbflush) +KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) -KVM_X86_OP_NULL(complete_emulated_msr) +KVM_X86_OP(complete_emulated_msr) +KVM_X86_OP(vcpu_deliver_sipi_vector) #undef KVM_X86_OP -#undef KVM_X86_OP_NULL +#undef KVM_X86_OP_OPTIONAL +#undef KVM_X86_OP_OPTIONAL_RET0 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6dcccb304775..884c926e4359 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1128,10 +1128,6 @@ struct kvm_arch { struct kvm_hv hyperv; struct kvm_xen xen; - #ifdef CONFIG_KVM_MMU_AUDIT - int audit_point; - #endif - bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; u32 bsp_vcpu_id; @@ -1318,7 +1314,6 @@ struct kvm_x86_ops { int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); - bool (*cpu_has_accelerated_tpr)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); @@ -1331,7 +1326,7 @@ struct kvm_x86_ops { void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); + void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); @@ -1361,8 +1356,8 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); bool (*get_if_flag)(struct kvm_vcpu *vcpu); - void (*tlb_flush_all)(struct kvm_vcpu *vcpu); - void (*tlb_flush_current)(struct kvm_vcpu *vcpu); + void (*flush_tlb_all)(struct kvm_vcpu *vcpu); + void (*flush_tlb_current)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); @@ -1373,16 +1368,16 @@ struct kvm_x86_ops { * Can potentially get non-canonical addresses through INVLPGs, which * the implementation may choose to ignore if appropriate. */ - void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); + void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr); /* * Flush any TLB entries created by the guest. Like tlb_flush_gva(), * does not need to flush GPA->HPA mappings. */ - void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); - enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1391,8 +1386,8 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu); - void (*set_nmi)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); @@ -1459,9 +1454,9 @@ struct kvm_x86_ops { void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); - int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); - void (*start_assignment)(struct kvm *kvm); + void (*pi_start_assignment)(struct kvm *kvm); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); @@ -1476,9 +1471,9 @@ struct kvm_x86_ops { int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); void (*enable_smi_window)(struct kvm_vcpu *vcpu); - int (*mem_enc_op)(struct kvm *kvm, void __user *argp); - int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); - int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); + int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); @@ -1541,15 +1536,22 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> static inline void kvm_ops_static_call_update(void) { -#define KVM_X86_OP(func) \ +#define __KVM_X86_OP(func) \ static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func ? : \ + (void *) __static_call_return0); #include <asm/kvm-x86-ops.h> +#undef __KVM_X86_OP } #define __KVM_HAVE_ARCH_VM_ALLOC @@ -1587,6 +1589,13 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level); +void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + int target_level); +void kvm_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + u64 start, u64 end, + int target_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, @@ -1724,7 +1733,6 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -1878,7 +1886,7 @@ static inline bool kvm_is_supported_user_return_msr(u32 msr) return kvm_find_user_return_msr(msr) >= 0; } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio); +u64 kvm_scale_tsc(u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3faf0f97edb1..a4a39c3e0f19 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -476,6 +476,7 @@ #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b #define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index b00dbc5fac2b..bb2fb78523ce 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -220,6 +220,42 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) + +/* AVIC */ +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 +#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) + +#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) +#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) +#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFF) + +#define AVIC_DOORBELL_PHYSICAL_ID_MASK (0xFF) + +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + +enum avic_ipi_failure_cause { + AVIC_IPI_FAILURE_INVALID_INT_TYPE, + AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, + AVIC_IPI_FAILURE_INVALID_TARGET, + AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, +}; + + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. + */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff + +#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + + struct vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2b1548da00eb..e3cbd7706136 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -126,13 +126,6 @@ config KVM_XEN If in doubt, say "N". -config KVM_MMU_AUDIT - bool "Audit KVM MMU" - depends on KVM && TRACEPOINTS - help - This option adds a R/W kVM module parameter 'mmu_audit', which allows - auditing of KVM MMU events at runtime. - config KVM_EXTERNAL_WRITE_TRACKING bool diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 494d4d351859..ff756cdc31ce 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -712,9 +712,17 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, entry = &array->entries[array->nent++]; + memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; - entry->flags = 0; + switch (function & 0xC0000000) { + case 0x40000000: + /* Hypervisor leaves are always synthesized by __do_cpuid_func. */ + return entry; + + default: + break; + } cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5719d8cfdbd9..b7990defca9c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2600,8 +2600,7 @@ emulate_shutdown: } static void -setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct desc_struct *cs, struct desc_struct *ss) +setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss) { cs->l = 0; /* will be adjusted later */ set_desc_base(cs, 0); /* flat segment */ @@ -2690,7 +2689,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) if (!(efer & EFER_SCE)) return emulate_ud(ctxt); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); @@ -2758,7 +2757,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; @@ -2795,7 +2794,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ctxt->mode == X86EMUL_MODE_VM86) return emulate_gp(ctxt, 0); - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(&cs, &ss); if ((ctxt->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -3013,8 +3012,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static int task_switch_16(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, u16 old_tss_sel, +static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { struct tss_segment_16 tss_seg; @@ -3152,8 +3150,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return ret; } -static int task_switch_32(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, u16 old_tss_sel, +static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { struct tss_segment_32 tss_seg; @@ -3261,10 +3258,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, old_tss_sel = 0xffff; if (next_tss_desc.type & 8) - ret = task_switch_32(ctxt, tss_selector, old_tss_sel, - old_tss_base, &next_tss_desc); + ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc); else - ret = task_switch_16(ctxt, tss_selector, old_tss_sel, + ret = task_switch_16(ctxt, old_tss_sel, old_tss_base, &next_tss_desc); if (ret != X86EMUL_CONTINUE) return ret; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6e38a7d22e97..653e08c993c4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -112,6 +112,9 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, if (!!auto_eoi_old == !!auto_eoi_new) return; + if (!enable_apicv) + return; + down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) @@ -1710,32 +1713,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) return kvm_hv_get_msr(vcpu, msr, pdata, host); } -static __always_inline unsigned long *sparse_set_to_vcpu_mask( - struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, - u64 *vp_bitmap, unsigned long *vcpu_bitmap) +static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, + u64 valid_bank_mask, unsigned long *vcpu_mask) { struct kvm_hv *hv = to_kvm_hv(kvm); + bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes); + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *vcpu; int bank, sbank = 0; unsigned long i; + u64 *bitmap; + + BUILD_BUG_ON(sizeof(vp_bitmap) > + sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS)); - memset(vp_bitmap, 0, - KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); + /* + * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else + * fill a temporary buffer and manually test each vCPU's VP index. + */ + if (likely(!has_mismatch)) + bitmap = (u64 *)vcpu_mask; + else + bitmap = vp_bitmap; + + /* + * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask + * having a '1' for each bank that exists in sparse_banks. Sets must + * be in ascending order, i.e. bank0..bankN. + */ + memset(bitmap, 0, sizeof(vp_bitmap)); for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, KVM_HV_MAX_SPARSE_VCPU_SET_BITS) - vp_bitmap[bank] = sparse_banks[sbank++]; + bitmap[bank] = sparse_banks[sbank++]; - if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) { - /* for all vcpus vp_index == vcpu_idx */ - return (unsigned long *)vp_bitmap; - } + if (likely(!has_mismatch)) + return; - bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) - __set_bit(i, vcpu_bitmap); + __set_bit(i, vcpu_mask); } - return vcpu_bitmap; } struct kvm_hv_hcall { @@ -1743,6 +1761,7 @@ struct kvm_hv_hcall { u64 ingpa; u64 outgpa; u16 code; + u16 var_cnt; u16 rep_cnt; u16 rep_idx; bool fast; @@ -1750,21 +1769,40 @@ struct kvm_hv_hcall { sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; }; +static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, + u64 *sparse_banks, gpa_t offset) +{ + u16 var_cnt; + + if (hc->var_cnt > 64) + return -EINVAL; + + /* Ignore banks that cannot possibly contain a legal VP index. */ + var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS); + + return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks, + var_cnt * sizeof(*sparse_banks)); +} + static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) { int i; - gpa_t gpa; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); - unsigned long *vcpu_mask; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); u64 valid_bank_mask; - u64 sparse_banks[64]; - int sparse_banks_len; + u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; bool all_cpus; + /* + * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the + * valid mask is a u64. Fail the build if KVM's max allowed number of + * vCPUs (>4096) would exceed this limit, KVM will additional changes + * for Hyper-V support to avoid setting the guest up to fail. + */ + BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64); + if (!ex) { if (hc->fast) { flush.address_space = hc->ingpa; @@ -1812,30 +1850,32 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64); + if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + if (all_cpus) + goto do_flush; - if (!sparse_banks_len && !all_cpus) + if (!hc->var_cnt) goto ret_success; - if (!all_cpus) { - if (hc->fast) { - if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1) - return HV_STATUS_INVALID_HYPERCALL_INPUT; - for (i = 0; i < sparse_banks_len; i += 2) { - sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]); - sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]); - } - } else { - gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents); - if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks, - sparse_banks_len * - sizeof(sparse_banks[0])))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->fast) { + if (hc->var_cnt > HV_HYPERCALL_MAX_XMM_REGISTERS - 1) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + for (i = 0; i < hc->var_cnt; i += 2) { + sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]); + sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]); } + goto do_flush; } + + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, + offsetof(struct hv_tlb_flush_ex, + hv_vp_set.bank_contents))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; } +do_flush: /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. @@ -1843,11 +1883,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool if (all_cpus) { kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); } else { - vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, - vcpu_mask); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask); } ret_success: @@ -1880,12 +1918,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; - u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; - DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); - unsigned long *vcpu_mask; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); unsigned long valid_bank_mask; - u64 sparse_banks[64]; - int sparse_banks_len; + u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; @@ -1918,22 +1953,20 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; - sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * - sizeof(sparse_banks[0]); - all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (all_cpus) goto check_and_send_ipi; - if (!sparse_banks_len) + if (!hc->var_cnt) goto ret_success; - if (kvm_read_guest(kvm, - hc->ingpa + offsetof(struct hv_send_ipi_ex, - vp_set.bank_contents), - sparse_banks, - sparse_banks_len)) + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, + offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents))) return HV_STATUS_INVALID_HYPERCALL_INPUT; } @@ -1941,11 +1974,13 @@ check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); + if (all_cpus) { + kvm_send_ipi_to_many(kvm, vector, NULL); + } else { + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + } ret_success: return HV_STATUS_SUCCESS; @@ -2017,11 +2052,6 @@ int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) return ret; } -bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; -} - static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; @@ -2191,19 +2221,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } hc.code = hc.param & 0xffff; + hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET; hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); - trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, - hc.ingpa, hc.outgpa); + trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt, + hc.rep_idx, hc.ingpa, hc.outgpa); if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } + if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + goto hypercall_complete; + } + if (hc.fast && is_xmm_fast_hypercall(&hc)) { if (unlikely(hv_vcpu->enforce_cpuid && !(hv_vcpu->cpuid_cache.features_edx & @@ -2217,14 +2253,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2234,7 +2270,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) { + if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2247,14 +2283,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_complete_userspace; return 0; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: - if (unlikely(!hc.rep_cnt || hc.rep_idx)) { + if (unlikely(!hc.rep_cnt || hc.rep_idx || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc, false); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2275,7 +2311,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = kvm_hv_flush_tlb(vcpu, &hc, true); break; case HVCALL_SEND_IPI: - if (unlikely(hc.rep)) { + if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -2417,10 +2453,6 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, if (kvm_x86_ops.nested_ops->get_evmcs_version) evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); - /* Skip NESTED_FEATURES if eVMCS is not supported */ - if (!evmcs_ver) - --nent; - if (cpuid->nent < nent) return -E2BIG; @@ -2520,8 +2552,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; - if (evmcs_ver) - ent->eax |= HV_X64_NESTED_MSR_BITMAP; + ent->eax |= HV_X64_NESTED_MSR_BITMAP; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index ed1c4e546d04..e19c00ee9ab3 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -89,7 +89,11 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); -bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu); +static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 814064d06016..be99dc86293d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -437,13 +437,13 @@ static u32 pic_ioport_read(void *opaque, u32 addr) return ret; } -static void elcr_ioport_write(void *opaque, u32 addr, u32 val) +static void elcr_ioport_write(void *opaque, u32 val) { struct kvm_kpic_state *s = opaque; s->elcr = val & s->elcr_mask; } -static u32 elcr_ioport_read(void *opaque, u32 addr1) +static u32 elcr_ioport_read(void *opaque) { struct kvm_kpic_state *s = opaque; return s->elcr; @@ -474,7 +474,7 @@ static int picdev_write(struct kvm_pic *s, case 0x4d0: case 0x4d1: pic_lock(s); - elcr_ioport_write(&s->pics[addr & 1], addr, data); + elcr_ioport_write(&s->pics[addr & 1], data); pic_unlock(s); break; default: @@ -505,7 +505,7 @@ static int picdev_read(struct kvm_pic *s, case 0x4d0: case 0x4d1: pic_lock(s); - *data = elcr_ioport_read(&s->pics[addr & 1], addr); + *data = elcr_ioport_read(&s->pics[addr & 1]); pic_unlock(s); break; default: diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index decfa36b7891..765943d7cfa5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -54,9 +54,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, int trigger_mode, int pin); -static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, - unsigned long addr, - unsigned long length) +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic) { unsigned long result = 0; @@ -593,7 +591,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, break; case IOAPIC_REG_WINDOW: - result = ioapic_read_indirect(ioapic, addr, len); + result = ioapic_read_indirect(ioapic); break; default: diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index b469f45e3fe4..ee4f696a0782 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -92,3 +92,17 @@ int hv_remote_flush_tlb(struct kvm *kvm) return hv_remote_flush_tlb_with_range(kvm, NULL); } EXPORT_SYMBOL_GPL(hv_remote_flush_tlb); + +void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) +{ + struct kvm_arch *kvm_arch = &vcpu->kvm->arch; + + if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + spin_lock(&kvm_arch->hv_root_tdp_lock); + vcpu->arch.hv_root_tdp = root_tdp; + if (root_tdp != kvm_arch->hv_root_tdp) + kvm_arch->hv_root_tdp = INVALID_PAGE; + spin_unlock(&kvm_arch->hv_root_tdp_lock); + } +} +EXPORT_SYMBOL_GPL(hv_track_root_tdp); diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index 1c67abf2eba9..287e98ef9df3 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -10,19 +10,7 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, struct kvm_tlb_range *range); int hv_remote_flush_tlb(struct kvm *kvm); - -static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) -{ - struct kvm_arch *kvm_arch = &vcpu->kvm->arch; - - if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { - spin_lock(&kvm_arch->hv_root_tdp_lock); - vcpu->arch.hv_root_tdp = root_tdp; - if (root_tdp != kvm_arch->hv_root_tdp) - kvm_arch->hv_root_tdp = INVALID_PAGE; - spin_unlock(&kvm_arch->hv_root_tdp_lock); - } -} +void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); #else /* !CONFIG_HYPERV */ static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d7e6fde82d25..47f8606559a9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) { - return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); + return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) && + (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); } bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) @@ -491,8 +492,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call(kvm_x86_hwapic_irr_update)(vcpu, - apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -522,7 +522,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - static_call(kvm_x86_hwapic_isr_update)(vcpu, vec); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -570,8 +570,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - static_call(kvm_x86_hwapic_isr_update)(vcpu, - apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -2287,7 +2286,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) - static_call(kvm_x86_set_virtual_apic_mode)(vcpu); + static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2306,7 +2305,12 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) apic->irr_pending = true; apic->isr_count = 1; } else { - apic->irr_pending = (apic_search_irr(apic) != -1); + /* + * Don't clear irr_pending, searching the IRR can race with + * updates from the CPU as APICv is still active from hardware's + * perspective. The flag will be cleared as appropriate when + * KVM injects the interrupt. + */ apic->isr_count = count_vectors(apic->regs + APIC_ISR); } } @@ -2368,9 +2372,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { - static_call(kvm_x86_apicv_post_state_restore)(vcpu); - static_call(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call(kvm_x86_hwapic_isr_update)(vcpu, -1); + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; @@ -2633,11 +2637,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - static_call(kvm_x86_apicv_post_state_restore)(vcpu); - static_call(kvm_x86_hwapic_irr_update)(vcpu, - apic_find_highest_irr(apic)); - static_call(kvm_x86_hwapic_isr_update)(vcpu, - apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); + static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) @@ -2928,7 +2930,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector); + static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e9fbb2c8bbe2..51faa2c76ca5 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -203,44 +203,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } /* - * Currently, we have two sorts of write-protection, a) the first one - * write-protects guest page to sync the guest modification, b) another one is - * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences - * between these two sorts are: - * 1) the first case clears MMU-writable bit. - * 2) the first case requires flushing tlb immediately avoiding corrupting - * shadow page table between all vcpus so it should be in the protection of - * mmu-lock. And the another case does not need to flush tlb until returning - * the dirty bitmap to userspace since it only write-protects the page - * logged in the bitmap, that means the page in the dirty bitmap is not - * missed, so it can flush tlb out of mmu-lock. - * - * So, there is the problem: the first case can meet the corrupted tlb caused - * by another case which write-protects pages but without flush tlb - * immediately. In order to making the first case be aware this problem we let - * it flush tlb if we try to write-protect a spte whose MMU-writable bit - * is set, it works since another case never touches MMU-writable bit. - * - * Anyway, whenever a spte is updated (only permission and status bits are - * changed) we need to check whether the spte with MMU-writable becomes - * readonly, if that happens, we need to flush tlb. Fortunately, - * mmu_spte_update() has already handled it perfectly. - * - * The rules to use MMU-writable and PT_WRITABLE_MASK: - * - if we want to see if it has writable tlb entry or if the spte can be - * writable on the mmu mapping, check MMU-writable, this is the most - * case, otherwise - * - if we fix page fault on the spte or do write-protection by dirty logging, - * check PT_WRITABLE_MASK. - * - * TODO: introduce APIs to split these two cases. - */ -static inline bool is_writable_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - -/* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE * access rights (in ACC_* format). diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 593093b52395..0620480b99e0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -104,15 +104,6 @@ static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; -enum { - AUDIT_PRE_PAGE_FAULT, - AUDIT_POST_PAGE_FAULT, - AUDIT_PRE_PTE_WRITE, - AUDIT_POST_PTE_WRITE, - AUDIT_PRE_SYNC, - AUDIT_POST_SYNC -}; - #ifdef MMU_DEBUG bool dbg = 0; module_param(dbg, bool, 0644); @@ -529,6 +520,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) u64 old_spte = *sptep; WARN_ON(!is_shadow_present_pte(new_spte)); + check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); @@ -548,11 +540,9 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. + * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote + * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only + * spte, even though the writable spte might be cached on a CPU's TLB. * * Returns true if the TLB needs to be flushed */ @@ -646,24 +636,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -/* Restore an acc-track PTE back to a regular PTE */ -static u64 restore_acc_track_spte(u64 spte) -{ - u64 new_spte = spte; - u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) - & SHADOW_ACC_TRACK_SAVED_BITS_MASK; - - WARN_ON_ONCE(spte_ad_enabled(spte)); - WARN_ON_ONCE(!is_access_track_spte(spte)); - - new_spte &= ~shadow_acc_track_mask; - new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << - SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); - new_spte |= saved_bits; - - return new_spte; -} - /* Returns the Accessed status of the PTE and resets it at the same time. */ static bool mmu_spte_age(u64 *sptep) { @@ -1229,9 +1201,8 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) return mmu_spte_update(sptep, spte); } -static bool __rmap_write_protect(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - bool pt_protect) +static bool rmap_write_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect) { u64 *sptep; struct rmap_iterator iter; @@ -1311,7 +1282,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); - __rmap_write_protect(kvm, rmap_head, false); + rmap_write_protect(rmap_head, false); /* clear the first set bit */ mask &= mask - 1; @@ -1378,6 +1349,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); + if (READ_ONCE(eager_page_split)) + kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K); + kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); /* Cross two large pages? */ @@ -1410,7 +1384,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); + write_protected |= rmap_write_protect(rmap_head, true); } } @@ -1421,7 +1395,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, return write_protected; } -static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; @@ -1921,13 +1895,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, return true; } -#ifdef CONFIG_KVM_MMU_AUDIT -#include "mmu_audit.c" -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } -static void mmu_audit_disable(void) { } -#endif - static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->role.invalid) @@ -2024,7 +1991,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, bool protected = false; for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu, sp->gfn); + protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn); if (protected) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); @@ -2149,7 +2116,7 @@ trace_get_page: hlist_add_head(&sp->hash_link, sp_list); if (!direct) { account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) + if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); } trace_kvm_mmu_get_page(sp, true); @@ -2307,7 +2274,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, return zapped; } -static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; @@ -2351,7 +2318,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); - kvm_mmu_unlink_parents(kvm, sp); + kvm_mmu_unlink_parents(sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; @@ -3687,17 +3654,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) return; write_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - mmu_sync_children(vcpu, sp, true); - - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); return; } write_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; @@ -3709,7 +3671,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } } - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); } @@ -4474,8 +4435,7 @@ static inline bool boot_cpu_is_amd(void) * possible, however, kvm currently does not do execution-protection. */ static void -reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; @@ -4506,8 +4466,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, * is the shadow page table for intel nested guest. */ static void -reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) +reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, reserved_hpa_bits(), execonly, @@ -4794,7 +4753,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, context); - reset_tdp_shadow_zero_bits_mask(vcpu, context); + reset_tdp_shadow_zero_bits_mask(context); } static union kvm_mmu_role @@ -4948,7 +4907,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, update_permission_bitmask(context, true); context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); - reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); + reset_ept_shadow_zero_bits_mask(context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -5100,7 +5059,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); kvm_mmu_load_pgd(vcpu); - static_call(kvm_x86_tlb_flush_current)(vcpu); + static_call(kvm_x86_flush_tlb_current)(vcpu); out: return r; } @@ -5260,7 +5219,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || @@ -5285,7 +5243,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); - kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); write_unlock(&vcpu->kvm->mmu_lock); } @@ -5360,7 +5317,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (is_noncanonical_address(gva, vcpu)) return; - static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); } if (!mmu->invlpg) @@ -5416,7 +5373,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (tlb_flush) - static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); ++vcpu->stat.invlpg; @@ -5802,7 +5759,7 @@ static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { - return __rmap_write_protect(kvm, rmap_head, false); + return rmap_write_protect(rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -5846,12 +5803,52 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * will clear a separate software-only bit (MMU-writable) and skip the * flush if-and-only-if this bit was already clear. * - * See DEFAULT_SPTE_MMU_WRITEABLE for more details. + * See is_writable_pte() for more details. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } +/* Must be called with the mmu_lock held in write-mode. */ +void kvm_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + u64 start, u64 end, + int target_level) +{ + if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, + target_level, false); + + /* + * A TLB flush is unnecessary at this point for the same resons as in + * kvm_mmu_slot_try_split_huge_pages(). + */ +} + +void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + int target_level) +{ + u64 start = memslot->base_gfn; + u64 end = start + memslot->npages; + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); + read_unlock(&kvm->mmu_lock); + } + + /* + * No TLB flush is necessary here. KVM will flush TLBs after + * write-protecting and/or clearing dirty on the newly split SPTEs to + * ensure that guest writes are reflected in the dirty log before the + * ioctl to enable dirty logging on this memslot completes. Since the + * split SPTEs retain the write and dirty bits of the huge SPTE, it is + * safe for KVM to decide if a TLB flush is necessary based on the split + * SPTEs. + */ +} + static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) @@ -6191,7 +6188,6 @@ void kvm_mmu_module_exit(void) mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); - mmu_audit_disable(); } /* diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c deleted file mode 100644 index 9e7dcf999f08..000000000000 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * mmu_audit.c: - * - * Audit code for KVM MMU - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Yaniv Kamay <[email protected]> - * Avi Kivity <[email protected]> - * Marcelo Tosatti <[email protected]> - * Xiao Guangrong <[email protected]> - */ - -#include <linux/ratelimit.h> - -static char const *audit_point_name[] = { - "pre page fault", - "post page fault", - "pre pte write", - "post pte write", - "pre sync", - "post sync" -}; - -#define audit_printk(kvm, fmt, args...) \ - printk(KERN_ERR "audit: (%s) error: " \ - fmt, audit_point_name[kvm->arch.audit_point], ##args) - -typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); - -static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - inspect_spte_fn fn, int level) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 *ent = sp->spt; - - fn(vcpu, ent + i, level); - - if (is_shadow_present_pte(ent[i]) && - !is_last_spte(ent[i], level)) { - struct kvm_mmu_page *child; - - child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(vcpu, child, fn, level - 1); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - - if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu->root_hpa; - - sp = to_shadow_page(root); - __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); - return; - } - - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu->pae_root[i]; - - if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = to_shadow_page(root); - __mmu_spte_walk(vcpu, sp, fn, 2); - } - } - - return; -} - -typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); - -static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) - fn(kvm, sp); -} - -static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp; - gfn_t gfn; - kvm_pfn_t pfn; - hpa_t hpa; - - sp = sptep_to_sp(sptep); - - if (sp->unsync) { - if (level != PG_LEVEL_4K) { - audit_printk(vcpu->kvm, "unsync sp: %p " - "level = %d\n", sp, level); - return; - } - } - - if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) - return; - - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); - - if (is_error_pfn(pfn)) - return; - - hpa = pfn << PAGE_SHIFT; - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " - "ent %llxn", vcpu->arch.mmu->root_level, pfn, - hpa, *sptep); -} - -static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - struct kvm_rmap_head *rmap_head; - struct kvm_mmu_page *rev_sp; - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - gfn_t gfn; - - rev_sp = sptep_to_sp(sptep); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); - slot = __gfn_to_memslot(slots, gfn); - if (!slot) { - if (!__ratelimit(&ratelimit_state)) - return; - audit_printk(kvm, "no memslot for gfn %llx\n", gfn); - audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", - (long int)(sptep - rev_sp->spt), rev_sp->gfn); - dump_stack(); - return; - } - - rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); - if (!rmap_head->val) { - if (!__ratelimit(&ratelimit_state)) - return; - audit_printk(kvm, "no rmap for writable spte %llx\n", - *sptep); - dump_stack(); - } -} - -static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) - inspect_spte_has_rmap(vcpu->kvm, sptep); -} - -static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp = sptep_to_sp(sptep); - - if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) - audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " - "root.\n", sp); -} - -static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - int i; - - if (sp->role.level != PG_LEVEL_4K) - return; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_shadow_present_pte(sp->spt[i])) - continue; - - inspect_spte_has_rmap(kvm, sp->spt + i); - } -} - -static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - struct kvm_rmap_head *rmap_head; - u64 *sptep; - struct rmap_iterator iter; - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - - if (sp->role.direct || sp->unsync || sp->role.invalid) - return; - - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (is_writable_pte(*sptep)) - audit_printk(kvm, "shadow page has writable " - "mappings: gfn %llx role %x\n", - sp->gfn, sp->role.word); - } -} - -static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - check_mappings_rmap(kvm, sp); - audit_write_protection(kvm, sp); -} - -static void audit_all_active_sps(struct kvm *kvm) -{ - walk_all_active_sps(kvm, audit_sp); -} - -static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - audit_sptes_have_rmaps(vcpu, sptep, level); - audit_mappings(vcpu, sptep, level); - audit_spte_after_sync(vcpu, sptep, level); -} - -static void audit_vcpu_spte(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, audit_spte); -} - -static bool mmu_audit; -static DEFINE_STATIC_KEY_FALSE(mmu_audit_key); - -static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - - if (!__ratelimit(&ratelimit_state)) - return; - - vcpu->kvm->arch.audit_point = point; - audit_all_active_sps(vcpu->kvm); - audit_vcpu_spte(vcpu); -} - -static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) -{ - if (static_branch_unlikely((&mmu_audit_key))) - __kvm_mmu_audit(vcpu, point); -} - -static void mmu_audit_enable(void) -{ - if (mmu_audit) - return; - - static_branch_inc(&mmu_audit_key); - mmu_audit = true; -} - -static void mmu_audit_disable(void) -{ - if (!mmu_audit) - return; - - static_branch_dec(&mmu_audit_key); - mmu_audit = false; -} - -static int mmu_audit_set(const char *val, const struct kernel_param *kp) -{ - int ret; - unsigned long enable; - - ret = kstrtoul(val, 10, &enable); - if (ret < 0) - return -EINVAL; - - switch (enable) { - case 0: - mmu_audit_disable(); - break; - case 1: - mmu_audit_enable(); - break; - default: - return -EINVAL; - } - - return 0; -} - -static const struct kernel_param_ops audit_param_ops = { - .set = mmu_audit_set, - .get = param_get_bool, -}; - -arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index de5e8e4e1aa7..12247b96af01 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -416,6 +416,29 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + kvm_mmu_split_huge_page, + TP_PROTO(u64 gfn, u64 spte, int level, int errno), + TP_ARGS(gfn, spte, level, errno), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, spte) + __field(int, level) + __field(int, errno) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->spte = spte; + __entry->level = level; + __entry->errno = errno; + ), + + TP_printk("gfn %llx spte %llx level %d errno %d", + __entry->gfn, __entry->spte, __entry->level, __entry->errno) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 5b5bdac97c7b..aa0e3c246aca 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -904,12 +904,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; - kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; r = FNAME(fetch)(vcpu, fault, &walker); - kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 73cfe62fdad1..4739b53c9734 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -192,6 +192,65 @@ out: return wrprot; } +static u64 make_spte_executable(u64 spte) +{ + bool is_access_track = is_access_track_spte(spte); + + if (is_access_track) + spte = restore_acc_track_spte(spte); + + spte &= ~shadow_nx_mask; + spte |= shadow_x_mask; + + if (is_access_track) + spte = mark_spte_for_access_track(spte); + + return spte; +} + +/* + * Construct an SPTE that maps a sub-page of the given huge page SPTE where + * `index` identifies which sub-page. + * + * This is used during huge page splitting to build the SPTEs that make up the + * new page table. + */ +u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +{ + u64 child_spte; + int child_level; + + if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) + return 0; + + if (WARN_ON_ONCE(!is_large_pte(huge_spte))) + return 0; + + child_spte = huge_spte; + child_level = huge_level - 1; + + /* + * The child_spte already has the base address of the huge page being + * split. So we just have to OR in the offset to the page at the next + * lower level for the given index. + */ + child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + + if (child_level == PG_LEVEL_4K) { + child_spte &= ~PT_PAGE_SIZE_MASK; + + /* + * When splitting to a 4K page, mark the page executable as the + * NX hugepage mitigation no longer applies. + */ + if (is_nx_huge_page_enabled()) + child_spte = make_spte_executable(child_spte); + } + + return child_spte; +} + + u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { u64 spte = SPTE_MMU_PRESENT_MASK; @@ -250,14 +309,7 @@ u64 mark_spte_for_access_track(u64 spte) if (is_access_track_spte(spte)) return spte; - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. - */ - WARN_ONCE((spte & PT_WRITABLE_MASK) && - !spte_can_locklessly_be_made_writable(spte), - "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + check_spte_writable_invariants(spte); WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), @@ -368,8 +420,8 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_acc_track_mask = 0; shadow_me_mask = sme_me_mask; - shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE; - shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE; + shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE; + shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE; /* * Set a reserved PA bit in MMIO SPTEs to generate page faults with diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index be6a007a4af3..73f12615416f 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -75,33 +75,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); /* - * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits - * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs - * that map guest pages in read-only memslots and read-only VMAs. - * - * Invariants: - * - If Host-writable is clear, PT_WRITABLE_MASK must be clear. - * - * - * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU - * allows writes to the guest page mapped by the SPTE. This bit is cleared when - * the guest page mapped by the SPTE contains a page table that is being - * monitored for shadow paging. In this case the SPTE can only be made writable - * by unsyncing the shadow page under the mmu_lock. - * - * Invariants: - * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear. - * - If MMU-writable is set, Host-writable must be set. - * - * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared - * to track writes for dirty logging. For such SPTEs, KVM will locklessly set - * PT_WRITABLE_MASK upon the next write from the guest and record the write in - * the dirty log (see fast_page_fault()). + * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given + * SPTE is write-protected. See is_writable_pte() for details. */ /* Bits 9 and 10 are ignored by all non-EPT PTEs. */ -#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) -#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) +#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10) /* * Low ignored bits are at a premium for EPT, use high ignored bits, taking care @@ -339,15 +319,86 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, __is_rsvd_bits_set(rsvd_check, spte, level); } -static inline bool spte_can_locklessly_be_made_writable(u64 spte) +/* + * An shadow-present leaf SPTE may be non-writable for 3 possible reasons: + * + * 1. To intercept writes for dirty logging. KVM write-protects huge pages + * so that they can be split be split down into the dirty logging + * granularity (4KiB) whenever the guest writes to them. KVM also + * write-protects 4KiB pages so that writes can be recorded in the dirty log + * (e.g. if not using PML). SPTEs are write-protected for dirty logging + * during the VM-iotcls that enable dirty logging. + * + * 2. To intercept writes to guest page tables that KVM is shadowing. When a + * guest writes to its page table the corresponding shadow page table will + * be marked "unsync". That way KVM knows which shadow page tables need to + * be updated on the next TLB flush, INVLPG, etc. and which do not. + * + * 3. To prevent guest writes to read-only memory, such as for memory in a + * read-only memslot or guest memory backed by a read-only VMA. Writes to + * such pages are disallowed entirely. + * + * To keep track of why a given SPTE is write-protected, KVM uses 2 + * software-only bits in the SPTE: + * + * shadow_mmu_writable_mask, aka MMU-writable - + * Cleared on SPTEs that KVM is currently write-protecting for shadow paging + * purposes (case 2 above). + * + * shadow_host_writable_mask, aka Host-writable - + * Cleared on SPTEs that are not host-writable (case 3 above) + * + * Note, not all possible combinations of PT_WRITABLE_MASK, + * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given + * SPTE can be in only one of the following states, which map to the + * aforementioned 3 cases: + * + * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK + * ------------------------- | ------------------------ | ---------------- + * 1 | 1 | 1 (writable) + * 1 | 1 | 0 (case 1) + * 1 | 0 | 0 (case 2) + * 0 | 0 | 0 (case 3) + * + * The valid combinations of these bits are checked by + * check_spte_writable_invariants() whenever an SPTE is modified. + * + * Clearing the MMU-writable bit is always done under the MMU lock and always + * accompanied by a TLB flush before dropping the lock to avoid corrupting the + * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging + * (which does not clear the MMU-writable bit), does not flush TLBs before + * dropping the lock, as it only needs to synchronize guest writes with the + * dirty bitmap. + * + * So, there is the problem: clearing the MMU-writable bit can encounter a + * write-protected SPTE while CPUs still have writable mappings for that SPTE + * cached in their TLB. To address this, KVM always flushes TLBs when + * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE. + * + * The Host-writable bit is not modified on present SPTEs, it is only set or + * cleared when an SPTE is first faulted in from non-present and then remains + * immutable. + */ +static inline bool is_writable_pte(unsigned long pte) { - if (spte & shadow_mmu_writable_mask) { - WARN_ON_ONCE(!(spte & shadow_host_writable_mask)); - return true; - } + return pte & PT_WRITABLE_MASK; +} + +/* Note: spte must be a shadow-present leaf SPTE. */ +static inline void check_spte_writable_invariants(u64 spte) +{ + if (spte & shadow_mmu_writable_mask) + WARN_ONCE(!(spte & shadow_host_writable_mask), + "kvm: MMU-writable SPTE is not Host-writable: %llx", + spte); + else + WARN_ONCE(is_writable_pte(spte), + "kvm: Writable SPTE is not MMU-writable: %llx", spte); +} - WARN_ON_ONCE(spte & PT_WRITABLE_MASK); - return false; +static inline bool spte_can_locklessly_be_made_writable(u64 spte) +{ + return spte & shadow_mmu_writable_mask; } static inline u64 get_mmio_spte_generation(u64 spte) @@ -364,9 +415,25 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); +u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); + +/* Restore an acc-track PTE back to a regular PTE */ +static inline u64 restore_acc_track_spte(u64 spte) +{ + u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) + & SHADOW_ACC_TRACK_SAVED_BITS_MASK; + + spte &= ~shadow_acc_track_mask; + spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); + spte |= saved_bits; + + return spte; +} + u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); void kvm_mmu_reset_all_pte_masks(void); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index caa96c270b95..be3f096db2eb 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -40,17 +40,19 @@ void tdp_iter_restart(struct tdp_iter *iter) * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ -void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, +void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn) { + int root_level = root->role.level; + WARN_ON(root_level < 1); WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; - iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; + iter->as_id = kvm_mmu_page_as_id(root); tdp_iter_restart(iter); } diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index e19cabbcb65c..216ebbe76ddd 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -57,17 +57,17 @@ struct tdp_iter { * Iterates over every SPTE mapping the GFN range [start, end) in a * preorder traversal. */ -#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \ - for (tdp_iter_start(&iter, root, root_level, min_level, start); \ +#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \ + for (tdp_iter_start(&iter, root, min_level, start); \ iter.valid && iter.gfn < end; \ tdp_iter_next(&iter)) -#define for_each_tdp_pte(iter, root, root_level, start, end) \ - for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) +#define for_each_tdp_pte(iter, root, start, end) \ + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) tdp_ptep_t spte_to_child_pt(u64 pte, int level); -void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, +void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index bc9e3553fba2..8def8f810cb0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -99,15 +99,18 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Finds the next valid root after root (or the first valid root if root - * is NULL), takes a reference on it, and returns that next root. If root - * is not NULL, this thread should have already taken a reference on it, and - * that reference will be dropped. If no valid root is found, this - * function will return NULL. + * Returns the next root after @prev_root (or the first root if @prev_root is + * NULL). A reference to the returned root is acquired, and the reference to + * @prev_root is released (the caller obviously must hold a reference to + * @prev_root if it's non-NULL). + * + * If @only_valid is true, invalid roots are skipped. + * + * Returns NULL if the end of tdp_mmu_roots was reached. */ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *prev_root, - bool shared) + bool shared, bool only_valid) { struct kvm_mmu_page *next_root; @@ -121,9 +124,14 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, typeof(*next_root), link); - while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root)) + while (next_root) { + if ((!only_valid || !next_root->role.invalid) && + kvm_tdp_mmu_get_root(next_root)) + break; + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, &next_root->link, typeof(*next_root), link); + } rcu_read_unlock(); @@ -143,13 +151,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * mode. In the unlikely event that this thread must free a root, the lock * will be temporarily dropped and reacquired in write mode. */ -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \ - _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _shared)) \ - if (kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else +#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) + +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false) + #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \ lockdep_is_held_type(&kvm->mmu_lock, 0) || \ @@ -157,57 +171,63 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else -static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, - int level) +static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { - union kvm_mmu_page_role role; + struct kvm_mmu_page *sp; - role = vcpu->arch.mmu->mmu_role.base; - role.level = level; - role.direct = true; - role.has_4_byte_gpte = false; - role.access = ACC_ALL; - role.ad_disabled = !shadow_accessed_mask; + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); - return role; + return sp; } -static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, - int level) +static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, gfn_t gfn, + union kvm_mmu_page_role role) { - struct kvm_mmu_page *sp; - - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - sp->role.word = page_role_for_level(vcpu, level).word; + sp->role = role; sp->gfn = gfn; sp->tdp_mmu_page = true; trace_kvm_mmu_get_page(sp, true); +} - return sp; +static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, + struct tdp_iter *iter) +{ + struct kvm_mmu_page *parent_sp; + union kvm_mmu_page_role role; + + parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); + + role = parent_sp->role; + role.level--; + + tdp_mmu_init_sp(child_sp, iter->gfn, role); } hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { - union kvm_mmu_page_role role; + union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); - role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); - - /* Check for an existing root before allocating a new one. */ + /* + * Check for an existing root before allocating a new one. Note, the + * role check prevents consuming an invalid root. + */ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { if (root->role.word == role.word && - kvm_tdp_mmu_get_root(kvm, root)) + kvm_tdp_mmu_get_root(root)) goto out; } - root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); + root = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_sp(root, 0, role); + refcount_set(&root->tdp_mmu_root_count, 1); spin_lock(&kvm->arch.tdp_mmu_pages_lock); @@ -252,25 +272,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, } /** - * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU - * - * @kvm: kvm instance - * @sp: the new page - * @account_nx: This page replaces a NX large page and should be marked for - * eventual reclaim. - */ -static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool account_nx) -{ - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - list_add(&sp->link, &kvm->arch.tdp_mmu_pages); - if (account_nx) - account_huge_nx_page(kvm, sp); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); -} - -/** - * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU + * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages * * @kvm: kvm instance * @sp: the page to be removed @@ -278,8 +280,8 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, * the MMU lock and the operation must synchronize with other * threads that might be adding or removing pages. */ -static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared) +static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared) { if (shared) spin_lock(&kvm->arch.tdp_mmu_pages_lock); @@ -295,7 +297,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, } /** - * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure + * handle_removed_pt() - handle a page table removed from the TDP structure * * @kvm: kvm instance * @pt: the page removed from the paging structure @@ -311,8 +313,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, * this thread will be responsible for ensuring the page is freed. Hence the * early rcu_dereferences in the function. */ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, - bool shared) +static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) { struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; @@ -321,7 +322,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, trace_kvm_mmu_prepare_zap_page(sp); - tdp_mmu_unlink_page(kvm, sp, shared); + tdp_mmu_unlink_sp(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { u64 *sptep = rcu_dereference(pt) + i; @@ -435,6 +436,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + if (is_leaf) + check_spte_writable_invariants(new_spte); + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -472,8 +476,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * the paging structure. */ if (was_present && !was_leaf && (pfn_changed || !is_present)) - handle_removed_tdp_mmu_page(kvm, - spte_to_child_pt(old_spte, level), shared); + handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, @@ -492,16 +495,25 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * and handle the associated bookkeeping. Do not mark the page dirty * in KVM's dirty bitmaps. * + * If setting the SPTE fails because it has changed, iter->old_spte will be + * refreshed to the current value of the spte. + * * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @new_spte: The value the SPTE should be set to - * Returns: true if the SPTE was set, false if it was not. If false is returned, - * this function will have no side-effects. + * Return: + * * 0 - If the SPTE was set. + * * -EBUSY - If the SPTE cannot be set. In this case this function will have + * no side-effects other than setting iter->old_spte to the last + * known value of the spte. */ -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { + u64 *sptep = rcu_dereference(iter->sptep); + u64 old_spte; + WARN_ON_ONCE(iter->yielded); lockdep_assert_held_read(&kvm->mmu_lock); @@ -511,34 +523,45 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, * may modify it. */ if (is_removed_spte(iter->old_spte)) - return false; + return -EBUSY; /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock. */ - if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, - new_spte) != iter->old_spte) - return false; + old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); + if (old_spte != iter->old_spte) { + /* + * The page table entry was modified by a different logical + * CPU. Refresh iter->old_spte with the current value so the + * caller operates on fresh data, e.g. if it retries + * tdp_mmu_set_spte_atomic(). + */ + iter->old_spte = old_spte; + return -EBUSY; + } __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); - return true; + return 0; } -static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) +static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) { + int ret; + /* * Freeze the SPTE by setting it to a special, * non-present value. This will stop other threads from * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) - return false; + ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); + if (ret) + return ret; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, KVM_PAGES_PER_HPAGE(iter->level)); @@ -553,7 +576,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, */ WRITE_ONCE(*rcu_dereference(iter->sptep), 0); - return true; + return 0; } @@ -624,7 +647,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ - for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) + for_each_tdp_pte(_iter, _root, _start, _end) #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ tdp_root_for_each_pte(_iter, _root, _start, _end) \ @@ -634,8 +657,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, else #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ - for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ - _mmu->shadow_root_level, _start, _end) + for_each_tdp_pte(_iter, to_shadow_page(_mmu->root_hpa), _start, _end) /* * Yield if the MMU lock is contended or this thread needs to return control @@ -724,8 +746,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, start, end) { + for_each_tdp_pte_min_level(iter, root, min_level, start, end) { retry: if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { @@ -750,12 +771,7 @@ retry: if (!shared) { tdp_mmu_set_spte(kvm, &iter, 0); flush = true; - } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + } else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) { goto retry; } } @@ -913,7 +929,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; /* @@ -947,6 +963,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, } /* + * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the + * provided page table. + * + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @sp: The new TDP page table to install. + * @account_nx: True if this page table is being installed to split a + * non-executable huge page. + * @shared: This operation is running under the MMU lock in read mode. + * + * Returns: 0 if the new page table was installed. Non-0 if the page table + * could not be installed (e.g. the atomic compare-exchange failed). + */ +static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool account_nx, + bool shared) +{ + u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask); + int ret = 0; + + if (shared) { + ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); + if (ret) + return ret; + } else { + tdp_mmu_set_spte(kvm, iter, spte); + } + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_add(&sp->link, &kvm->arch.tdp_mmu_pages); + if (account_nx) + account_huge_nx_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + + return 0; +} + +/* * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ @@ -955,8 +1009,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm_mmu *mmu = vcpu->arch.mmu; struct tdp_iter iter; struct kvm_mmu_page *sp; - u64 *child_pt; - u64 new_spte; int ret; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -979,7 +1031,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) */ if (is_shadow_present_pte(iter.old_spte) && is_large_pte(iter.old_spte)) { - if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) + if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) break; /* @@ -991,6 +1043,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } if (!is_shadow_present_pte(iter.old_spte)) { + bool account_nx = fault->huge_page_disallowed && + fault->req_level >= iter.level; + /* * If SPTE has been frozen by another thread, just * give up and retry, avoiding unnecessary page table @@ -999,19 +1054,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (is_removed_spte(iter.old_spte)) break; - sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); - child_pt = sp->spt; - - new_spte = make_nonleaf_spte(child_pt, - !shadow_accessed_mask); + sp = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_child_sp(sp, &iter); - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) { - tdp_mmu_link_page(vcpu->kvm, sp, - fault->huge_page_disallowed && - fault->req_level >= iter.level); - - trace_kvm_mmu_get_page(sp, true); - } else { + if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { tdp_mmu_free_sp(sp); break; } @@ -1032,13 +1078,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { - struct kvm_mmu_page *root; - - for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) - flush = zap_gfn_range(kvm, root, range->start, range->end, - range->may_block, flush, false); - - return flush; + return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start, + range->end, range->may_block, flush); } typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, @@ -1180,8 +1221,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, start, end) { + for_each_tdp_pte_min_level(iter, root, min_level, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; @@ -1193,14 +1233,9 @@ retry: new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) goto retry; - } + spte_set = true; } @@ -1221,13 +1256,197 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); return spte_set; } +static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) +{ + struct kvm_mmu_page *sp; + + gfp |= __GFP_ZERO; + + sp = kmem_cache_alloc(mmu_page_header_cache, gfp); + if (!sp) + return NULL; + + sp->spt = (void *)__get_free_page(gfp); + if (!sp->spt) { + kmem_cache_free(mmu_page_header_cache, sp); + return NULL; + } + + return sp; +} + +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, + struct tdp_iter *iter, + bool shared) +{ + struct kvm_mmu_page *sp; + + /* + * Since we are allocating while under the MMU lock we have to be + * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct + * reclaim and to avoid making any filesystem callbacks (which can end + * up invoking KVM MMU notifiers, resulting in a deadlock). + * + * If this allocation fails we drop the lock and retry with reclaim + * allowed. + */ + sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); + if (sp) + return sp; + + rcu_read_unlock(); + + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); + + iter->yielded = true; + sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); + + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); + + rcu_read_lock(); + + return sp; +} + +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool shared) +{ + const u64 huge_spte = iter->old_spte; + const int level = iter->level; + int ret, i; + + tdp_mmu_init_child_sp(sp, iter); + + /* + * No need for atomics when writing to sp->spt since the page table has + * not been linked in yet and thus is not reachable from any other CPU. + */ + for (i = 0; i < PT64_ENT_PER_PAGE; i++) + sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + + /* + * Replace the huge spte with a pointer to the populated lower level + * page table. Since we are making this change without a TLB flush vCPUs + * will see a mix of the split mappings and the original huge mapping, + * depending on what's currently in their TLB. This is fine from a + * correctness standpoint since the translation will be the same either + * way. + */ + ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); + if (ret) + goto out; + + /* + * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we + * are overwriting from the page stats. But we have to manually update + * the page stats with the new present child pages. + */ + kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + +out: + trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); + return ret; +} + +static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, + struct kvm_mmu_page *root, + gfn_t start, gfn_t end, + int target_level, bool shared) +{ + struct kvm_mmu_page *sp = NULL; + struct tdp_iter iter; + int ret = 0; + + rcu_read_lock(); + + /* + * Traverse the page table splitting all huge pages above the target + * level into one lower level. For example, if we encounter a 1GB page + * we split it into 512 2MB pages. + * + * Since the TDP iterator uses a pre-order traversal, we are guaranteed + * to visit an SPTE before ever visiting its children, which means we + * will correctly recursively split huge pages that are more than one + * level above the target level (e.g. splitting a 1GB to 512 2MB pages, + * and then splitting each of those to 512 4KB pages). + */ + for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) + continue; + + if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) + continue; + + if (!sp) { + sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); + if (!sp) { + ret = -ENOMEM; + trace_kvm_mmu_split_huge_page(iter.gfn, + iter.old_spte, + iter.level, ret); + break; + } + + if (iter.yielded) + continue; + } + + if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) + goto retry; + + sp = NULL; + } + + rcu_read_unlock(); + + /* + * It's possible to exit the loop having never used the last sp if, for + * example, a vCPU doing HugePage NX splitting wins the race and + * installs its own sp in place of the last sp we tried to split. + */ + if (sp) + tdp_mmu_free_sp(sp); + + return ret; +} + + +/* + * Try to split all huge pages mapped by the TDP MMU down to the target level. + */ +void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level, bool shared) +{ + struct kvm_mmu_page *root; + int r = 0; + + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { + r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); + if (r) { + kvm_tdp_mmu_put_root(kvm, root, shared); + break; + } + } +} + /* * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. @@ -1261,14 +1480,9 @@ retry: continue; } - if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) goto retry; - } + spte_set = true; } @@ -1291,7 +1505,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); @@ -1392,14 +1606,8 @@ retry: continue; /* Note, a successful atomic zap also does a remote TLB flush. */ - if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { - /* - * The iter must explicitly re-read the SPTE because - * the atomic cmpxchg failed. - */ - iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + if (tdp_mmu_zap_spte_atomic(kvm, &iter)) goto retry; - } } rcu_read_unlock(); @@ -1416,7 +1624,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) zap_collapsible_spte_range(kvm, root, slot); } @@ -1436,8 +1644,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root->spt, root->role.level, - min_level, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3899004a5d91..3f987785702a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,12 +7,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); -__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, - struct kvm_mmu_page *root) +__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { - if (root->role.invalid) - return false; - return refcount_inc_not_zero(&root->tdp_mmu_root_count); } @@ -71,6 +67,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level, bool shared); + static inline void kvm_tdp_mmu_walk_lockless_begin(void) { rcu_read_lock(); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f614f95acc6b..b1a02993782b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -95,7 +95,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, } static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - unsigned config, bool exclude_user, + u64 config, bool exclude_user, bool exclude_kernel, bool intr, bool in_tx, bool in_tx_cp) { @@ -181,7 +181,8 @@ static int cmp_u64(const void *a, const void *b) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { - unsigned config, type = PERF_TYPE_RAW; + u64 config; + u32 type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; bool allow_event = true; @@ -220,7 +221,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) } if (type == PERF_TYPE_RAW) - config = eventsel & X86_RAW_EVENT_MASK; + config = eventsel & AMD64_RAW_EVENT_MASK; if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) return; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 90364d02f22a..d4fa8c4f3a9a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -27,20 +27,6 @@ #include "irq.h" #include "svm.h" -#define SVM_AVIC_DOORBELL 0xc001011b - -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - -/* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. - */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 255 - -#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 -#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 -#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF - /* AVIC GATAG is encoded using VM and VCPU IDs */ #define AVIC_VCPU_ID_BITS 8 #define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) @@ -73,12 +59,6 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; -enum avic_ipi_failure_cause { - AVIC_IPI_FAILURE_INVALID_INT_TYPE, - AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, - AVIC_IPI_FAILURE_INVALID_TARGET, - AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, -}; /* Note: * This function is called from IOMMU driver to notify @@ -289,6 +269,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +void avic_ring_doorbell(struct kvm_vcpu *vcpu) +{ + /* + * Note, the vCPU could get migrated to a different pCPU at any point, + * which could result in signalling the wrong/previous pCPU. But if + * that happens the vCPU is guaranteed to do a VMRUN (after being + * migrated) and thus will process pending interrupts, i.e. a doorbell + * is not needed (and the spurious one is harmless). + */ + int cpu = READ_ONCE(vcpu->cpu); + + if (cpu != get_cpu()) + wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + put_cpu(); +} + static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh) { @@ -304,8 +300,13 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, kvm_for_each_vcpu(i, vcpu, kvm) { if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK)) - kvm_vcpu_wake_up(vcpu); + icrl & APIC_DEST_MASK)) { + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); + } } } @@ -345,8 +346,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh); break; case AVIC_IPI_FAILURE_INVALID_TARGET: - WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", - index, vcpu->vcpu_id, icrh, icrl); break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); @@ -579,7 +578,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -void avic_post_state_restore(struct kvm_vcpu *vcpu) +void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) return; @@ -587,20 +586,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) -{ - return; -} - -void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ -} - -void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) -{ -} - -static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; unsigned long flags; @@ -632,7 +618,7 @@ out: return ret; } -void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -649,7 +635,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) * we need to check and update the AVIC logical APIC ID table * accordingly before re-activating. */ - avic_post_state_restore(vcpu); + avic_apicv_post_state_restore(vcpu); vmcb->control.int_ctl |= AVIC_ENABLE_MASK; } else { vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; @@ -661,63 +647,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) else avic_vcpu_put(vcpu); - svm_set_pi_irte_mode(vcpu, activated); -} - -void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -{ - return; -} - -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) -{ - if (!vcpu->arch.apicv_active) - return -1; - - kvm_lapic_set_irr(vec, vcpu->arch.apic); - - /* - * Pairs with the smp_mb_*() after setting vcpu->guest_mode in - * vcpu_enter_guest() to ensure the write to the vIRR is ordered before - * the read of guest_mode, which guarantees that either VMRUN will see - * and process the new vIRR entry, or that the below code will signal - * the doorbell if the vCPU is already running in the guest. - */ - smp_mb__after_atomic(); - - /* - * Signal the doorbell to tell hardware to inject the IRQ if the vCPU - * is in the guest. If the vCPU is not in the guest, hardware will - * automatically process AVIC interrupts at VMRUN. - */ - if (vcpu->mode == IN_GUEST_MODE) { - int cpu = READ_ONCE(vcpu->cpu); - - /* - * Note, the vCPU could get migrated to a different pCPU at any - * point, which could result in signalling the wrong/previous - * pCPU. But if that happens the vCPU is guaranteed to do a - * VMRUN (after being migrated) and thus will process pending - * interrupts, i.e. a doorbell is not needed (and the spurious - * one is harmless). - */ - if (cpu != get_cpu()) - wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); - put_cpu(); - } else { - /* - * Wake the vCPU if it was blocking. KVM will then detect the - * pending IRQ when checking if the vCPU has a wake event. - */ - kvm_vcpu_wake_up(vcpu); - } - - return 0; -} - -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return false; + avic_set_pi_irte_mode(vcpu, activated); } static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) @@ -817,7 +747,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, } /* - * svm_update_pi_irte - set IRTE for Posted-Interrupts + * avic_pi_update_irte - set IRTE for Posted-Interrupts * * @kvm: kvm * @host_irq: host irq of the interrupt @@ -825,8 +755,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, * @set: set or unset PI * returns 0 on success, < 0 on failure */ -int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; @@ -926,7 +856,7 @@ out: return ret; } -bool svm_check_apicv_inhibit_reasons(ulong bit) +bool avic_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h new file mode 100644 index 000000000000..7d6d97968fb9 --- /dev/null +++ b/arch/x86/kvm/svm/hyperv.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM). + */ + +#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__ +#define __ARCH_X86_KVM_SVM_HYPERV_H__ + +#include <asm/mshyperv.h> + +#include "../hyperv.h" + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB + * control area to expose SVM enlightenments to guests. + */ +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW + +#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1218b5a342fc..f284e61451c8 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -28,6 +28,7 @@ #include "cpuid.h" #include "lapic.h" #include "svm.h" +#include "hyperv.h" #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK @@ -165,14 +166,30 @@ void recalc_intercepts(struct vcpu_svm *svm) vmcb_set_intercept(c, INTERCEPT_VMSAVE); } +/* + * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function + * is optimized in that it only merges the parts where KVM MSR permission bitmap + * may contain zero bits. + */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + struct hv_enlightenments *hve = + (struct hv_enlightenments *)svm->nested.ctl.reserved_sw; + int i; + /* - * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is optimized in that it only merges the parts where - * the kvm msr permission bitmap may contain zero bits + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) is using Hyper-V emulation interface and + * tells KVM (L0) there were no changes in MSR bitmap for L2. */ - int i; + if (!svm->nested.force_msr_bitmap_recalc && + kvm_hv_hypercall_enabled(&svm->vcpu) && + hve->hv_enlightenments_control.msr_bitmap && + (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS))) + goto set_msrpm_base_pa; if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; @@ -193,6 +210,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.msrpm[p] = svm->msrpm[p] | value; } + svm->nested.force_msr_bitmap_recalc = false; + +set_msrpm_base_pa: svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; @@ -298,7 +318,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) } static -void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, +void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *to, struct vmcb_control_area *from) { unsigned int i; @@ -331,12 +352,19 @@ void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, to->asid = from->asid; to->msrpm_base_pa &= ~0x0fffULL; to->iopm_base_pa &= ~0x0fffULL; + + /* Hyper-V extensions (Enlightened VMCB) */ + if (kvm_hv_hypercall_enabled(vcpu)) { + to->clean = from->clean; + memcpy(to->reserved_sw, from->reserved_sw, + sizeof(struct hv_enlightenments)); + } } void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control) { - __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control); + __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control); } static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, @@ -494,6 +522,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { new_vmcb12 = true; svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa; + svm->nested.force_msr_bitmap_recalc = true; } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { @@ -1302,6 +1331,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; dst->pause_filter_thresh = from->pause_filter_thresh; + /* 'clean' and 'reserved_sw' are not changed by KVM */ } static int svm_get_nested_state(struct kvm_vcpu *vcpu, @@ -1434,7 +1464,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl); + __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; @@ -1457,18 +1487,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, !__nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; - /* - * While the nested guest CR3 is already checked and set by - * KVM_SET_SREGS, it was set when nested state was yet loaded, - * thus MMU might not be initialized correctly. - * Set it again to fix this. - */ - - ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, - nested_npt_enabled(svm), false); - if (WARN_ON_ONCE(ret)) - goto out_free; - /* * All checks done, we can enter guest mode. Userspace provides @@ -1494,6 +1512,21 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm_switch_vmcb(svm, &svm->nested.vmcb02); nested_vmcb02_prepare_control(svm); + + /* + * While the nested guest CR3 is already checked and set by + * KVM_SET_SREGS, it was set when nested state was yet loaded, + * thus MMU might not be initialized correctly. + * Set it again to fix this. + */ + + ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm), false); + if (WARN_ON_ONCE(ret)) + goto out_free; + + svm->nested.force_msr_bitmap_recalc = true; + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 17b53457d866..789b69294d28 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free; INIT_LIST_HEAD(&sev->regions_list); + INIT_LIST_HEAD(&sev->mirror_vms); return 0; @@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) } } -static void sev_migrate_from(struct kvm_sev_info *dst, - struct kvm_sev_info *src) +static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { + struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *mirror; + dst->active = true; dst->asid = src->asid; dst->handle = src->handle; @@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst, src->enc_context_owner = NULL; list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); + + /* + * If this VM has mirrors, "transfer" each mirror's refcount of the + * source to the destination (this KVM). The caller holds a reference + * to the source, so there's no danger of use-after-free. + */ + list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms); + list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) { + kvm_get_kvm(dst_kvm); + kvm_put_kvm(src_kvm); + mirror->enc_context_owner = dst_kvm; + } + + /* + * If this VM is a mirror, remove the old mirror from the owners list + * and add the new mirror to the list. + */ + if (is_mirroring_enc_context(dst_kvm)) { + struct kvm_sev_info *owner_sev_info = + &to_kvm_svm(dst->enc_context_owner)->sev_info; + + list_del(&src->mirror_entry); + list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); + } } static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) @@ -1681,7 +1709,7 @@ static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) return 0; } -int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) +int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; @@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) src_sev = &to_kvm_svm(source_kvm)->sev_info; - /* - * VMs mirroring src's encryption context rely on it to keep the - * ASID allocated, but below we are clearing src_sev->asid. - */ - if (src_sev->num_mirrored_vms) { - ret = -EBUSY; - goto out_unlock; - } - dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) { @@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) if (ret) goto out_source_vcpu; } - sev_migrate_from(dst_sev, src_sev); + + sev_migrate_from(kvm, source_kvm); kvm_vm_dead(source_kvm); cg_cleanup_sev = src_sev; ret = 0; @@ -1761,7 +1781,7 @@ out_fput: return ret; } -int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; int r; @@ -1858,8 +1878,8 @@ out: return r; } -int svm_register_enc_region(struct kvm *kvm, - struct kvm_enc_region *range) +int sev_mem_enc_register_region(struct kvm *kvm, + struct kvm_enc_region *range) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct enc_region *region; @@ -1932,8 +1952,8 @@ static void __unregister_enc_region_locked(struct kvm *kvm, kfree(region); } -int svm_unregister_enc_region(struct kvm *kvm, - struct kvm_enc_region *range) +int sev_mem_enc_unregister_region(struct kvm *kvm, + struct kvm_enc_region *range) { struct enc_region *region; int ret; @@ -1972,7 +1992,7 @@ failed: return ret; } -int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) +int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; @@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) */ source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - source_sev->num_mirrored_vms++; + mirror_sev = &to_kvm_svm(kvm)->sev_info; + list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ - mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; mirror_sev->asid = source_sev->asid; @@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->es_active = source_sev->es_active; mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); + INIT_LIST_HEAD(&mirror_sev->mirror_vms); ret = 0; /* @@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; - WARN_ON(sev->num_mirrored_vms); - if (!sev_guest(kvm)) return; + WARN_ON(!list_empty(&sev->mirror_vms)); + /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if (is_mirroring_enc_context(kvm)) { struct kvm *owner_kvm = sev->enc_context_owner; - struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info; mutex_lock(&owner_kvm->lock); - if (!WARN_ON(!owner_sev->num_mirrored_vms)) - owner_sev->num_mirrored_vms--; + list_del(&sev->mirror_entry); mutex_unlock(&owner_kvm->lock); kvm_put_kvm(owner_kvm); return; @@ -2173,7 +2192,7 @@ out: #endif } -void sev_hardware_teardown(void) +void sev_hardware_unsetup(void) { if (!sev_enabled) return; @@ -2907,20 +2926,16 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) sev_enc_bit)); } -void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu) +void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - struct vmcb_save_area *hostsa; - /* * As an SEV-ES guest, hardware will restore the host state on VMEXIT, - * of which one step is to perform a VMLOAD. Since hardware does not - * perform a VMSAVE on VMRUN, the host savearea must be updated. + * of which one step is to perform a VMLOAD. KVM performs the + * corresponding VMSAVE in svm_prepare_guest_switch for both + * traditional and SEV-ES guests. */ - vmsave(__sme_page_pa(sd->save_area)); /* XCR0 is restored on VMEXIT, save the current host value */ - hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); /* PKRU is restored on VMEXIT, save the current host value */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a290efb272ad..7038c76fa841 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -263,7 +263,7 @@ u32 svm_msrpm_offset(u32 msr) return MSR_INVALID; } -#define MAX_INST_SIZE 15 +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); static int get_npt_level(void) { @@ -353,7 +353,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static int skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -401,7 +401,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) * raises a fault that is not intercepted. Still better than * failing in all cases. */ - (void)skip_emulated_instruction(vcpu); + (void)svm_skip_emulated_instruction(vcpu); rip = kvm_rip_read(vcpu); svm->int3_rip = rip + svm->vmcb->save.cs.base; svm->int3_injected = rip - old_rip; @@ -668,6 +668,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write) { + struct vcpu_svm *svm = to_svm(vcpu); u8 bit_read, bit_write; unsigned long tmp; u32 offset; @@ -698,7 +699,7 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, msrpm[offset] = tmp; svm_hv_vmcb_dirty_nested_enlightenments(vcpu); - + svm->nested.force_msr_bitmap_recalc = true; } void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, @@ -873,11 +874,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -static void svm_hardware_teardown(void) +static void svm_hardware_unsetup(void) { int cpu; - sev_hardware_teardown(); + sev_hardware_unsetup(); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -1175,7 +1176,7 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) svm->vmcb = target_vmcb->ptr; } -static int svm_create_vcpu(struct kvm_vcpu *vcpu) +static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb01_page; @@ -1246,7 +1247,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb) cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); } -static void svm_free_vcpu(struct kvm_vcpu *vcpu) +static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1265,7 +1266,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) +static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); @@ -1280,10 +1281,12 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) * Save additional host state that will be restored on VMEXIT (sev-es) * or subsequent vmload of host save area. */ + vmsave(__sme_page_pa(sd->save_area)); if (sev_es_guest(vcpu->kvm)) { - sev_es_prepare_guest_switch(svm, vcpu->cpu); - } else { - vmsave(__sme_page_pa(sd->save_area)); + struct vmcb_save_area *hostsa; + hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); + + sev_es_prepare_switch_to_guest(hostsa); } if (tsc_scaling) { @@ -1529,6 +1532,15 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu) return save->cpl; } +static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + svm_get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} + static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1563,7 +1575,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) vmcb_mark_dirty(svm->vmcb, VMCB_DT); } -static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1585,6 +1597,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); u64 hcr0 = cr0; + bool old_paging = is_paging(vcpu); #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { @@ -1601,8 +1614,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #endif vcpu->arch.cr0 = cr0; - if (!npt_enabled) + if (!npt_enabled) { hcr0 |= X86_CR0_PG | X86_CR0_WP; + if (old_paging != is_paging(vcpu)) + svm_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* * re-enable caching here because the QEMU bios @@ -1643,11 +1659,15 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long old_cr4 = vcpu->arch.cr4; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu); + svm_flush_tlb_current(vcpu); vcpu->arch.cr4 = cr4; - if (!npt_enabled) + if (!npt_enabled) { cr4 |= X86_CR4_PAE; + + if (!is_paging(vcpu)) + cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -2261,7 +2281,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (!skip_emulated_instruction(vcpu)) + if (!svm_skip_emulated_instruction(vcpu)) return 0; } @@ -3126,7 +3146,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) +static bool svm_check_exit_valid(u64 exit_code) { return (exit_code < ARRAY_SIZE(svm_exit_handlers) && svm_exit_handlers[exit_code]); @@ -3146,7 +3166,7 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (!svm_check_exit_valid(vcpu, exit_code)) + if (!svm_check_exit_valid(exit_code)) return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE @@ -3181,7 +3201,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } -static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -3278,7 +3298,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; } -static void svm_set_irq(struct kvm_vcpu *vcpu) +static void svm_inject_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3291,21 +3311,55 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, - int trig_mode, int vector) +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vector) { - struct kvm_vcpu *vcpu = apic->vcpu; + /* + * vcpu->arch.apicv_active must be read after vcpu->mode. + * Pairs with smp_store_release in vcpu_enter_guest. + */ + bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); - if (svm_deliver_avic_intr(vcpu, vector)) { - kvm_lapic_set_irr(vector, apic); + if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Process the interrupt via inject_pending_event */ kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); + return; + } + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); + if (in_guest_mode) { + /* + * Signal the doorbell to tell hardware to inject the IRQ. If + * the vCPU exits the guest before the doorbell chimes, hardware + * will automatically process AVIC interrupts at the next VMRUN. + */ + avic_ring_doorbell(vcpu); } else { - trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector); + /* + * Wake the vCPU if it was blocking. KVM will then detect the + * pending IRQ when checking if the vCPU has a wake event. + */ + kvm_vcpu_wake_up(vcpu); } } +static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + kvm_lapic_set_irr(vector, apic); + + /* + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before + * the read of guest_mode. This guarantees that either VMRUN will see + * and process the new vIRR entry, or that svm_complete_interrupt_delivery + * will signal the doorbell if the CPU has already entered the guest. + */ + smp_mb__after_atomic(); + svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); +} + static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3353,11 +3407,13 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_nmi_blocked(vcpu)) + return 0; + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return -EBUSY; - - return !svm_nmi_blocked(vcpu); + return 1; } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -3409,9 +3465,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_interrupt_blocked(vcpu)) + return 0; + /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. @@ -3419,7 +3479,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) return -EBUSY; - return !svm_interrupt_blocked(vcpu); + return 1; } static void svm_enable_irq_window(struct kvm_vcpu *vcpu) @@ -3468,17 +3528,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - -static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) -{ - return 0; -} - -void svm_flush_tlb(struct kvm_vcpu *vcpu) +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3852,11 +3902,6 @@ static int __init svm_check_processor_compat(void) return 0; } -static bool svm_cpu_has_accelerated_tpr(void) -{ - return false; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. @@ -3879,11 +3924,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4150,11 +4190,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_smi_blocked(vcpu)) + return 0; + /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) return -EBUSY; - return !svm_smi_blocked(vcpu); + return 1; } static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) @@ -4188,7 +4231,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) * by 0x400 (matches the offset of 'struct vmcb_save_area' * within 'struct vmcb'). Note: HSAVE area may also be used by * L1 hypervisor to save additional host context (e.g. KVM does - * that, see svm_prepare_guest_switch()) which must be + * that, see svm_prepare_switch_to_guest()) which must be * preserved. */ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), @@ -4248,11 +4291,18 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) * Enter the nested guest now */ + vmcb_mark_all_dirty(svm->vmcb01.ptr); + vmcb12 = map.hva; nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + if (ret) + goto unmap_save; + + svm->nested.nested_run_pending = 1; + unmap_save: kvm_vcpu_unmap(vcpu, &map_save, true); unmap_map: @@ -4456,21 +4506,20 @@ static int svm_vm_init(struct kvm *kvm) static struct kvm_x86_ops svm_x86_ops __initdata = { .name = "kvm_amd", - .hardware_unsetup = svm_hardware_teardown, + .hardware_unsetup = svm_hardware_unsetup, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, - .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, .has_emulated_msr = svm_has_emulated_msr, - .vcpu_create = svm_create_vcpu, - .vcpu_free = svm_free_vcpu, + .vcpu_create = svm_vcpu_create, + .vcpu_free = svm_vcpu_free, .vcpu_reset = svm_vcpu_reset, .vm_size = sizeof(struct kvm_svm), .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, - .prepare_guest_switch = svm_prepare_guest_switch, + .prepare_switch_to_guest = svm_prepare_switch_to_guest, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_blocking = avic_vcpu_blocking, @@ -4484,9 +4533,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, - .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .get_cs_db_l_bits = svm_get_cs_db_l_bits, .set_cr0 = svm_set_cr0, - .post_set_cr3 = svm_post_set_cr3, + .post_set_cr3 = sev_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, @@ -4501,21 +4550,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, - .tlb_flush_all = svm_flush_tlb, - .tlb_flush_current = svm_flush_tlb, - .tlb_flush_gva = svm_flush_tlb_gva, - .tlb_flush_guest = svm_flush_tlb, + .flush_tlb_all = svm_flush_tlb_current, + .flush_tlb_current = svm_flush_tlb_current, + .flush_tlb_gva = svm_flush_tlb_gva, + .flush_tlb_guest = svm_flush_tlb_current, .vcpu_pre_run = svm_vcpu_pre_run, - .run = svm_vcpu_run, - .handle_exit = handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, + .vcpu_run = svm_vcpu_run, + .handle_exit = svm_handle_exit, + .skip_emulated_instruction = svm_skip_emulated_instruction, .update_emulated_instruction = NULL, .set_interrupt_shadow = svm_set_interrupt_shadow, .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, - .set_irq = svm_set_irq, - .set_nmi = svm_inject_nmi, + .inject_irq = svm_inject_irq, + .inject_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, @@ -4525,17 +4574,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, - .set_virtual_apic_mode = svm_set_virtual_apic_mode, - .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, - .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .load_eoi_exitmap = svm_load_eoi_exitmap, - .hwapic_irr_update = svm_hwapic_irr_update, - .hwapic_isr_update = svm_hwapic_isr_update, - .apicv_post_state_restore = avic_post_state_restore, - - .set_tss_addr = svm_set_tss_addr, - .set_identity_map_addr = svm_set_identity_map_addr, - .get_mt_mask = svm_get_mt_mask, + .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, + .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, + .apicv_post_state_restore = avic_apicv_post_state_restore, .get_exit_info = svm_get_exit_info, @@ -4561,8 +4602,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, - .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, - .update_pi_irte = svm_update_pi_irte, + .pi_update_irte = avic_pi_update_irte, .setup_mce = svm_setup_mce, .smi_allowed = svm_smi_allowed, @@ -4570,12 +4610,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, - .mem_enc_op = svm_mem_enc_op, - .mem_enc_reg_region = svm_register_enc_region, - .mem_enc_unreg_region = svm_unregister_enc_region, + .mem_enc_ioctl = sev_mem_enc_ioctl, + .mem_enc_register_region = sev_mem_enc_register_region, + .mem_enc_unregister_region = sev_mem_enc_unregister_region, - .vm_copy_enc_context_from = svm_vm_copy_asid_from, - .vm_move_enc_context_from = svm_vm_migrate_from, + .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, + .vm_move_enc_context_from = sev_vm_move_enc_context_from, .can_emulate_instruction = svm_can_emulate_instruction, @@ -4637,6 +4677,7 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { kvm_cpu_cap_set(X86_FEATURE_SVM); + kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); if (nrips) kvm_cpu_cap_set(X86_FEATURE_NRIPS); @@ -4819,7 +4860,7 @@ static __init int svm_hardware_setup(void) return 0; err: - svm_hardware_teardown(); + svm_hardware_unsetup(); return r; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 73525353e424..70850cbe5bcb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -79,7 +79,8 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ - unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ + struct list_head mirror_vms; /* List of VMs mirroring */ + struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; }; @@ -137,6 +138,8 @@ struct vmcb_ctrl_area_cached { u32 event_inj_err; u64 nested_cr3; u64 virt_ext; + u32 clean; + u8 reserved_sw[32]; }; struct svm_nested_state { @@ -163,6 +166,15 @@ struct svm_nested_state { struct vmcb_save_area_cached save; bool initialized; + + /* + * Indicates whether MSR bitmap for L2 needs to be rebuilt due to + * changes in MSR bitmap for L1 or switching to a different L2. Note, + * this flag can only be used reliably in conjunction with a paravirt L1 + * which informs L0 whether any changes to MSR bitmap for L2 were done + * on its side. + */ + bool force_msr_bitmap_recalc; }; struct vcpu_sev_es_state { @@ -321,7 +333,7 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) /* * Only the PDPTRs are loaded on demand into the shadow MMU. All other - * fields are synchronized in handle_exit, because accessing the VMCB is cheap. + * fields are synchronized on VM-Exit, because accessing the VMCB is cheap. * * CR3 might be out of date in the VMCB but it is not marked dirty; instead, * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3 @@ -480,7 +492,6 @@ void svm_vcpu_free_msrpm(u32 *msrpm); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); bool svm_smi_blocked(struct kvm_vcpu *vcpu); bool svm_nmi_blocked(struct kvm_vcpu *vcpu); @@ -489,6 +500,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vec); /* nested.c */ @@ -556,17 +569,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) -#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 -#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) - -#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) -#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) -#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) - -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -576,19 +578,18 @@ int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); -void avic_post_state_restore(struct kvm_vcpu *vcpu); -void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); -void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); -int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); +bool avic_check_apicv_inhibit_reasons(ulong bit); +void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); +void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); +bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); +int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_ring_doorbell(struct kvm_vcpu *vcpu); /* sev.c */ @@ -599,17 +600,17 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); extern unsigned int max_sev_asid; void sev_vm_destroy(struct kvm *kvm); -int svm_mem_enc_op(struct kvm *kvm, void __user *argp); -int svm_register_enc_region(struct kvm *kvm, - struct kvm_enc_region *range); -int svm_unregister_enc_region(struct kvm *kvm, - struct kvm_enc_region *range); -int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); -int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd); +int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp); +int sev_mem_enc_register_region(struct kvm *kvm, + struct kvm_enc_region *range); +int sev_mem_enc_unregister_region(struct kvm *kvm, + struct kvm_enc_region *range); +int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd); +int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd); void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); -void sev_hardware_teardown(void); +void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); @@ -617,7 +618,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); +void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 489ca56212c6..e2fc59380465 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -7,35 +7,12 @@ #define __ARCH_X86_KVM_SVM_ONHYPERV_H__ #if IS_ENABLED(CONFIG_HYPERV) -#include <asm/mshyperv.h> -#include "hyperv.h" #include "kvm_onhyperv.h" +#include "svm/hyperv.h" static struct kvm_x86_ops svm_x86_ops; -/* - * Hyper-V uses the software reserved 32 bytes in VMCB - * control area to expose SVM enlightenments to guests. - */ -struct hv_enlightenments { - struct __packed hv_enlightenments_control { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 enlightened_npt_tlb: 1; - u32 reserved:29; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - u64 hv_vm_id; - u64 partition_assist_page; - u64 reserved; -} __packed; - -/* - * Hyper-V uses the software reserved clean bit in VMCB - */ -#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW - int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); static inline void svm_hv_init_vmcb(struct vmcb *vmcb) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 92e6f6702f00..e5a8c271b42d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -64,9 +64,9 @@ TRACE_EVENT(kvm_hypercall, * Tracepoint for hypercall. */ TRACE_EVENT(kvm_hv_hypercall, - TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, - __u64 ingpa, __u64 outgpa), - TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt, + __u16 rep_idx, __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa), TP_STRUCT__entry( __field( __u16, rep_cnt ) @@ -74,6 +74,7 @@ TRACE_EVENT(kvm_hv_hypercall, __field( __u64, ingpa ) __field( __u64, outgpa ) __field( __u16, code ) + __field( __u16, var_cnt ) __field( bool, fast ) ), @@ -83,13 +84,14 @@ TRACE_EVENT(kvm_hv_hypercall, __entry->ingpa = ingpa; __entry->outgpa = outgpa; __entry->code = code; + __entry->var_cnt = var_cnt; __entry->fast = fast; ), - TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", __entry->code, __entry->fast ? "fast" : "slow", - __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, - __entry->outgpa) + __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx, + __entry->ingpa, __entry->outgpa) ); TRACE_EVENT(kvm_hv_hypercall_done, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ba34e94049c7..c73e4d938ddc 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4797,7 +4797,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl) { struct vcpu_vmx *vmx; @@ -4805,7 +4806,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) return; vmx = to_vmx(vcpu); - if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + if (vcpu_has_perf_global_ctrl) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high |= diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index b69a80f43b37..c92cea0b8ccc 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 466d18fc0c5d..03fab48b149c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -541,7 +541,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_entry_exit_ctls_update(vcpu); + nested_vmx_pmu_refresh(vcpu, + intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); if (intel_pmu_lbr_is_compatible(vcpu)) x86_perf_get_lbr(&lbr_desc->records); diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index aa1fe9085d77..3834bb30ce54 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -244,7 +244,7 @@ void vmx_pi_start_assignment(struct kvm *kvm) } /* - * pi_update_irte - set IRTE for Posted-Interrupts + * vmx_pi_update_irte - set IRTE for Posted-Interrupts * * @kvm: kvm * @host_irq: host irq of the interrupt @@ -252,8 +252,8 @@ void vmx_pi_start_assignment(struct kvm *kvm) * @set: set or unset PI * returns 0 on success, < 0 on failure */ -int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - bool set) +int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index eb14e76b84ef..9a45d5c9f116 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -97,8 +97,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); -int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - bool set); +int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); void vmx_pi_start_assignment(struct kvm *kvm); #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6c27bd0c89e1..d8547144d3b7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -541,11 +541,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static inline bool report_flexpriority(void) -{ - return flexpriority_enabled; -} - static int possible_passthrough_msr_slot(u32 msr) { u32 i; @@ -2341,7 +2336,7 @@ fault: return -EFAULT; } -static int hardware_enable(void) +static int vmx_hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2382,7 +2377,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -static void hardware_disable(void) +static void vmx_hardware_disable(void) { vmclear_local_loaded_vmcss(); @@ -3937,31 +3932,33 @@ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { /* - * The vector of interrupt to be delivered to vcpu had - * been set in PIR before this function. + * The vector of the virtual has already been set in the PIR. + * Send a notification event to deliver the virtual interrupt + * unless the vCPU is the currently running vCPU, i.e. the + * event is being sent from a fastpath VM-Exit handler, in + * which case the PIR will be synced to the vIRR before + * re-entering the guest. * - * Following cases will be reached in this block, and - * we always send a notification event in all cases as - * explained below. + * When the target is not the running vCPU, the following + * possibilities emerge: * - * Case 1: vcpu keeps in non-root mode. Sending a - * notification event posts the interrupt to vcpu. + * Case 1: vCPU stays in non-root mode. Sending a notification + * event posts the interrupt to the vCPU. * - * Case 2: vcpu exits to root mode and is still - * runnable. PIR will be synced to vIRR before the - * next vcpu entry. Sending a notification event in - * this case has no effect, as vcpu is not in root - * mode. + * Case 2: vCPU exits to root mode and is still runnable. The + * PIR will be synced to the vIRR before re-entering the guest. + * Sending a notification event is ok as the host IRQ handler + * will ignore the spurious event. * - * Case 3: vcpu exits to root mode and is blocked. - * vcpu_block() has already synced PIR to vIRR and - * never blocks vcpu if vIRR is not cleared. Therefore, - * a blocked vcpu here does not wait for any requested - * interrupts in PIR, and sending a notification event - * which has no effect is safe here. + * Case 3: vCPU exits to root mode and is blocked. vcpu_block() + * has already synced PIR to vIRR and never blocks the vCPU if + * the vIRR is not empty. Therefore, a blocked vCPU here does + * not wait for any requested interrupts in PIR, and sending a + * notification event also results in a benign, spurious event. */ - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + if (vcpu != kvm_get_running_vcpu()) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return; } #endif @@ -5184,7 +5181,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!kvm_require_dr(vcpu, dr)) return 1; - if (kvm_x86_ops.get_cpl(vcpu) > 0) + if (vmx_get_cpl(vcpu) > 0) goto out; dr7 = vmcs_readl(GUEST_DR7); @@ -6969,7 +6966,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) return vmx_exit_handlers_fastpath(vcpu); } -static void vmx_free_vcpu(struct kvm_vcpu *vcpu) +static void vmx_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6980,7 +6977,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_loaded_vmcs(vmx->loaded_vmcs); } -static int vmx_create_vcpu(struct kvm_vcpu *vcpu) +static int vmx_vcpu_create(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; @@ -7342,11 +7339,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; else - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + vmx->msr_ia32_feature_control_valid_bits &= ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); @@ -7659,6 +7656,7 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (ret) return ret; + vmx->nested.nested_run_pending = 1; vmx->nested.smm.guest_mode = false; } return 0; @@ -7684,7 +7682,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) } } -static void hardware_unsetup(void) +static void vmx_hardware_unsetup(void) { kvm_set_posted_intr_wakeup_handler(NULL); @@ -7707,21 +7705,20 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit) static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", - .hardware_unsetup = hardware_unsetup, + .hardware_unsetup = vmx_hardware_unsetup, - .hardware_enable = hardware_enable, - .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, + .hardware_enable = vmx_hardware_enable, + .hardware_disable = vmx_hardware_disable, .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, - .vcpu_create = vmx_create_vcpu, - .vcpu_free = vmx_free_vcpu, + .vcpu_create = vmx_vcpu_create, + .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_prepare_switch_to_guest, + .prepare_switch_to_guest = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -7749,21 +7746,21 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_rflags = vmx_set_rflags, .get_if_flag = vmx_get_if_flag, - .tlb_flush_all = vmx_flush_tlb_all, - .tlb_flush_current = vmx_flush_tlb_current, - .tlb_flush_gva = vmx_flush_tlb_gva, - .tlb_flush_guest = vmx_flush_tlb_guest, + .flush_tlb_all = vmx_flush_tlb_all, + .flush_tlb_current = vmx_flush_tlb_current, + .flush_tlb_gva = vmx_flush_tlb_gva, + .flush_tlb_guest = vmx_flush_tlb_guest, .vcpu_pre_run = vmx_vcpu_pre_run, - .run = vmx_vcpu_run, + .vcpu_run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, + .inject_irq = vmx_inject_irq, + .inject_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, @@ -7816,8 +7813,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = pi_update_irte, - .start_assignment = vmx_pi_start_assignment, + .pi_update_irte = vmx_pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7131d735b1ef..16d29d41908f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -126,16 +126,15 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); struct kvm_x86_ops kvm_x86_ops __read_mostly; -EXPORT_SYMBOL_GPL(kvm_x86_ops); #define KVM_X86_OP(func) \ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); -EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); @@ -194,6 +193,9 @@ bool __read_mostly enable_pmu = true; EXPORT_SYMBOL_GPL(enable_pmu); module_param(enable_pmu, bool, 0444); +bool __read_mostly eager_page_split = true; +module_param(eager_page_split, bool, 0644); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU @@ -2399,7 +2401,7 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc) return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio) +u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; @@ -2414,7 +2416,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); + tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } @@ -2422,7 +2424,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { return vcpu->arch.l1_tsc_offset + - kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio); + kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); @@ -2625,7 +2627,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment, + adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } @@ -3045,7 +3047,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz, + tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { @@ -3268,7 +3270,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_all)(vcpu); + static_call(kvm_x86_flush_tlb_all)(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -3286,14 +3288,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_prev_roots(vcpu); } - static_call(kvm_x86_tlb_flush_guest)(vcpu); + static_call(kvm_x86_flush_tlb_guest)(vcpu); } static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_current)(vcpu); + static_call(kvm_x86_flush_tlb_current)(vcpu); } /* @@ -3857,7 +3859,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) ratio = vcpu->arch.tsc_scaling_ratio; } - msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset; + msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } case MSR_MTRRcap: @@ -4233,6 +4235,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_SYS_ATTRIBUTES: + case KVM_CAP_VAPIC: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4273,9 +4276,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; - case KVM_CAP_VAPIC: - r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); - break; case KVM_CAP_NR_VCPUS: r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; @@ -5132,7 +5132,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && kvm->arch.last_tsc_offset == offset); - tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; + tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); @@ -5977,15 +5977,18 @@ split_irqchip_unlock: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: r = -EINVAL; - if (kvm_x86_ops.vm_copy_enc_context_from) - r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); - return r; + if (!kvm_x86_ops.vm_copy_enc_context_from) + break; + + r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]); + break; case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: r = -EINVAL; - if (kvm_x86_ops.vm_move_enc_context_from) - r = kvm_x86_ops.vm_move_enc_context_from( - kvm, cap->args[0]); - return r; + if (!kvm_x86_ops.vm_move_enc_context_from) + break; + + r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]); + break; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { r = -EINVAL; @@ -6460,8 +6463,10 @@ set_pit2_out: break; case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops.mem_enc_op) - r = static_call(kvm_x86_mem_enc_op)(kvm, argp); + if (!kvm_x86_ops.mem_enc_ioctl) + goto out; + + r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -6472,8 +6477,10 @@ set_pit2_out: goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_reg_region) - r = static_call(kvm_x86_mem_enc_reg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_register_region) + goto out; + + r = static_call(kvm_x86_mem_enc_register_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -6484,8 +6491,10 @@ set_pit2_out: goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_unreg_region) - r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_unregister_region) + goto out; + + r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -8413,8 +8422,7 @@ writeback: kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); - if (kvm_x86_ops.update_emulated_instruction) - static_call(kvm_x86_update_emulated_instruction)(vcpu); + static_call_cond(kvm_x86_update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -8965,7 +8973,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, * * @apicid - apicid of vcpu to be kicked. */ -static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) { struct kvm_lapic_irq lapic_irq; @@ -9084,7 +9092,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) break; - kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + kvm_pv_kick_cpu_op(vcpu->kvm, a1); kvm_sched_yield(vcpu, a1); ret = 0; break; @@ -9254,10 +9262,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) */ else if (!vcpu->arch.exception.pending) { if (vcpu->arch.nmi_injected) { - static_call(kvm_x86_set_nmi)(vcpu); + static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_set_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu); can_inject = false; } } @@ -9337,7 +9345,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - static_call(kvm_x86_set_nmi)(vcpu); + static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); } @@ -9351,7 +9359,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_set_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) @@ -9679,8 +9687,7 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) lockdep_assert_held_write(&kvm->arch.apicv_update_lock); - if (!kvm_x86_ops.check_apicv_inhibit_reasons || - !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) + if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; old = new = kvm->arch.apicv_inhibit_reasons; @@ -9713,10 +9720,12 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) } else kvm->arch.apicv_inhibit_reasons = new; } -EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { + if (!enable_apicv) + return; + down_write(&kvm->arch.apicv_update_lock); __kvm_request_apicv_update(kvm, activate, bit); up_write(&kvm->arch.apicv_update_lock); @@ -9755,11 +9764,11 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); - static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); return; } - static_call(kvm_x86_load_eoi_exitmap)( + static_call_cond(kvm_x86_load_eoi_exitmap)( vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } @@ -9782,10 +9791,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (!kvm_x86_ops.set_apic_access_page_addr) - return; - - static_call(kvm_x86_set_apic_access_page_addr)(vcpu); + static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -9975,7 +9981,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - static_call(kvm_x86_prepare_guest_switch)(vcpu); + static_call(kvm_x86_prepare_switch_to_guest)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -9983,7 +9989,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * result in virtual interrupt delivery. */ local_irq_disable(); - vcpu->mode = IN_GUEST_MODE; + + /* Store vcpu->apicv_active before vcpu->mode. */ + smp_store_release(&vcpu->mode, IN_GUEST_MODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -10054,7 +10062,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); - exit_fastpath = static_call(kvm_x86_run)(vcpu); + exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -10357,10 +10365,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - /* - * Exclude PKRU from restore as restored separately in - * kvm_x86_ops.run(). - */ + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } @@ -10543,16 +10548,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; @@ -11975,6 +11970,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (kvm_dirty_log_manual_protect_and_init_set(kvm)) return; + if (READ_ONCE(eager_page_split)) + kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); + if (kvm_x86_ops.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); @@ -12019,8 +12017,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && - kvm_x86_ops.guest_apic_has_interrupt && - static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); + static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -12375,7 +12372,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - static_call_cond(kvm_x86_start_assignment)(kvm); + static_call_cond(kvm_x86_pi_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); @@ -12423,7 +12420,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) @@ -12448,7 +12445,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); @@ -12459,7 +12456,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); + return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 767ec7f99516..aa86abad914d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -307,6 +307,8 @@ extern int pi_inject_timer; extern bool report_ignored_msrs; +extern bool eager_page_split; + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index bad57535fad0..4aa0f2b31665 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -133,32 +133,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; + struct gfn_to_hva_cache *ghc = &vx->runstate_cache; + struct kvm_memslots *slots = kvm_memslots(v->kvm); + bool atomic = (state == RUNSTATE_runnable); uint64_t state_entry_time; - unsigned int offset; + int __user *user_state; + uint64_t __user *user_times; kvm_xen_update_runstate(v, state); if (!vx->runstate_set) return; - BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); + if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) && + kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len)) + return; + + /* We made sure it fits in a single page */ + BUG_ON(!ghc->memslot); + + if (atomic) + pagefault_disable(); - offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time); -#ifdef CONFIG_X86_64 /* - * The only difference is alignment of uint64_t in 32-bit. - * So the first field 'state' is accessed directly using - * offsetof() (where its offset happens to be zero), while the - * remaining fields which are all uint64_t, start at 'offset' - * which we tweak here by adding 4. + * The only difference between 32-bit and 64-bit versions of the + * runstate struct us the alignment of uint64_t in 32-bit, which + * means that the 64-bit version has an additional 4 bytes of + * padding after the first field 'state'. + * + * So we use 'int __user *user_state' to point to the state field, + * and 'uint64_t __user *user_times' for runstate_entry_time. So + * the actual array of time[] in each state starts at user_times[1]. */ + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); + BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); + user_state = (int __user *)ghc->hva; + + BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); + + user_times = (uint64_t __user *)(ghc->hva + + offsetof(struct compat_vcpu_runstate_info, + state_entry_time)); +#ifdef CONFIG_X86_64 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); if (v->kvm->arch.xen.long_mode) - offset = offsetof(struct vcpu_runstate_info, state_entry_time); + user_times = (uint64_t __user *)(ghc->hva + + offsetof(struct vcpu_runstate_info, + state_entry_time)); #endif /* * First write the updated state_entry_time at the appropriate @@ -172,10 +197,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &state_entry_time, offset, - sizeof(state_entry_time))) - return; + if (__put_user(state_entry_time, user_times)) + goto out; smp_wmb(); /* @@ -189,11 +212,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &vx->current_runstate, - offsetof(struct vcpu_runstate_info, state), - sizeof(vx->current_runstate))) - return; + if (__put_user(vx->current_runstate, user_state)) + goto out; /* * Write the actual runstate times immediately after the @@ -208,24 +228,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &vx->runstate_times[0], - offset + sizeof(u64), - sizeof(vx->runstate_times))) - return; - + if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times))) + goto out; smp_wmb(); /* * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's * runstate_entry_time field. */ - state_entry_time &= ~XEN_RUNSTATE_UPDATE; - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &state_entry_time, offset, - sizeof(state_entry_time))) - return; + __put_user(state_entry_time, user_times); + smp_wmb(); + + out: + mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + if (atomic) + pagefault_enable(); } int __kvm_xen_has_interrupt(struct kvm_vcpu *v) @@ -443,6 +462,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache, data->u.gpa, @@ -460,6 +485,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache, data->u.gpa, @@ -481,6 +512,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.runstate_cache, data->u.gpa, @@ -695,7 +732,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) instructions[0] = 0xb8; /* vmcall / vmmcall */ - kvm_x86_ops.patch_hypercall(vcpu, instructions + 5); + static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5); /* ret */ instructions[8] = 0xc3; @@ -830,7 +867,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_XEN; vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; vcpu->run->xen.u.hcall.longmode = longmode; - vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu); + vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu); vcpu->run->xen.u.hcall.input = input; vcpu->run->xen.u.hcall.params[0] = params[0]; vcpu->run->xen.u.hcall.params[1] = params[1]; diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 8f97c2927bee..fdce7a4cfc6f 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -183,11 +183,18 @@ enum HV_GENERIC_SET_FORMAT { #define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) #define HV_HYPERCALL_FAST_BIT BIT(16) #define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_VARHEAD_MASK GENMASK_ULL(26, 17) +#define HV_HYPERCALL_RSVD0_MASK GENMASK_ULL(31, 27) #define HV_HYPERCALL_REP_COMP_OFFSET 32 #define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32) #define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_RSVD1_MASK GENMASK_ULL(47, 44) #define HV_HYPERCALL_REP_START_OFFSET 48 #define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) +#define HV_HYPERCALL_RSVD2_MASK GENMASK_ULL(63, 60) +#define HV_HYPERCALL_RSVD_MASK (HV_HYPERCALL_RSVD0_MASK | \ + HV_HYPERCALL_RSVD1_MASK | \ + HV_HYPERCALL_RSVD2_MASK) /* hypercall status code */ #define HV_STATUS_SUCCESS 0 diff --git a/kernel/static_call.c b/kernel/static_call.c index 43ba0b1e0edb..f2b8baea35d2 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -503,6 +503,7 @@ long __static_call_return0(void) { return 0; } +EXPORT_SYMBOL_GPL(__static_call_return0); #ifdef CONFIG_STATIC_CALL_SELFTEST diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 086f490e808d..f9943b96ab3f 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -51,6 +51,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test @@ -82,7 +83,6 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/amx_test TEST_GEN_PROGS_x86_64 += access_tracking_perf_test diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 1954b964d1cf..101759ac93a4 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -298,12 +298,18 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-i iterations] [-p offset] " + printf("usage: %s [-h] [-i iterations] [-p offset] [-g]" "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" "[-x memslots]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); + printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n" + " makes KVM_GET_DIRTY_LOG clear the dirty log (i.e.\n" + " KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE is not enabled)\n" + " and writes will be tracked as soon as dirty logging is\n" + " enabled on the memslot (i.e. KVM_DIRTY_LOG_INITIALLY_SET\n" + " is not enabled).\n"); printf(" -p: specify guest physical test memory offset\n" " Warning: a low offset can conflict with the loaded test code.\n"); guest_modes_help(); @@ -343,8 +349,11 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:x:")) != -1) { + while ((opt = getopt(argc, argv, "ghi:p:m:b:f:v:os:x:")) != -1) { switch (opt) { + case 'g': + dirty_log_manual_caps = 0; + break; case 'i': p.iterations = atoi(optarg); break; diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index c9af97abd622..cc5d14a45702 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -213,6 +213,25 @@ struct hv_enlightened_vmcs { u64 padding64_6[7]; }; +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF + #define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 @@ -648,381 +667,507 @@ static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) switch (encoding) { case GUEST_RIP: current_evmcs->guest_rip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case GUEST_RSP: current_evmcs->guest_rsp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; break; case GUEST_RFLAGS: current_evmcs->guest_rflags = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; break; case HOST_IA32_PAT: current_evmcs->host_ia32_pat = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_IA32_EFER: current_evmcs->host_ia32_efer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_CR0: current_evmcs->host_cr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_CR3: current_evmcs->host_cr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_CR4: current_evmcs->host_cr4 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_IA32_SYSENTER_ESP: current_evmcs->host_ia32_sysenter_esp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_IA32_SYSENTER_EIP: current_evmcs->host_ia32_sysenter_eip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_RIP: current_evmcs->host_rip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case IO_BITMAP_A: current_evmcs->io_bitmap_a = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; break; case IO_BITMAP_B: current_evmcs->io_bitmap_b = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; break; case MSR_BITMAP: current_evmcs->msr_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; break; case GUEST_ES_BASE: current_evmcs->guest_es_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_CS_BASE: current_evmcs->guest_cs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_SS_BASE: current_evmcs->guest_ss_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_DS_BASE: current_evmcs->guest_ds_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_FS_BASE: current_evmcs->guest_fs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GS_BASE: current_evmcs->guest_gs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_LDTR_BASE: current_evmcs->guest_ldtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_TR_BASE: current_evmcs->guest_tr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GDTR_BASE: current_evmcs->guest_gdtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_IDTR_BASE: current_evmcs->guest_idtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case TSC_OFFSET: current_evmcs->tsc_offset = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; break; case VIRTUAL_APIC_PAGE_ADDR: current_evmcs->virtual_apic_page_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; break; case VMCS_LINK_POINTER: current_evmcs->vmcs_link_pointer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_IA32_DEBUGCTL: current_evmcs->guest_ia32_debugctl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_IA32_PAT: current_evmcs->guest_ia32_pat = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_IA32_EFER: current_evmcs->guest_ia32_efer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PDPTR0: current_evmcs->guest_pdptr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PDPTR1: current_evmcs->guest_pdptr1 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PDPTR2: current_evmcs->guest_pdptr2 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PDPTR3: current_evmcs->guest_pdptr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_PENDING_DBG_EXCEPTIONS: current_evmcs->guest_pending_dbg_exceptions = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_SYSENTER_ESP: current_evmcs->guest_sysenter_esp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_SYSENTER_EIP: current_evmcs->guest_sysenter_eip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case CR0_GUEST_HOST_MASK: current_evmcs->cr0_guest_host_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case CR4_GUEST_HOST_MASK: current_evmcs->cr4_guest_host_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case CR0_READ_SHADOW: current_evmcs->cr0_read_shadow = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case CR4_READ_SHADOW: current_evmcs->cr4_read_shadow = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case GUEST_CR0: current_evmcs->guest_cr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case GUEST_CR3: current_evmcs->guest_cr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case GUEST_CR4: current_evmcs->guest_cr4 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case GUEST_DR7: current_evmcs->guest_dr7 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; break; case HOST_FS_BASE: current_evmcs->host_fs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_GS_BASE: current_evmcs->host_gs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_TR_BASE: current_evmcs->host_tr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_GDTR_BASE: current_evmcs->host_gdtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_IDTR_BASE: current_evmcs->host_idtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case HOST_RSP: current_evmcs->host_rsp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; break; case EPT_POINTER: current_evmcs->ept_pointer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; break; case GUEST_BNDCFGS: current_evmcs->guest_bndcfgs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case XSS_EXIT_BITMAP: current_evmcs->xss_exit_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; break; case GUEST_PHYSICAL_ADDRESS: current_evmcs->guest_physical_address = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case EXIT_QUALIFICATION: current_evmcs->exit_qualification = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case GUEST_LINEAR_ADDRESS: current_evmcs->guest_linear_address = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_MSR_STORE_ADDR: current_evmcs->vm_exit_msr_store_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_EXIT_MSR_LOAD_ADDR: current_evmcs->vm_exit_msr_load_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_ENTRY_MSR_LOAD_ADDR: current_evmcs->vm_entry_msr_load_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_VALUE0: current_evmcs->cr3_target_value0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_VALUE1: current_evmcs->cr3_target_value1 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_VALUE2: current_evmcs->cr3_target_value2 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_VALUE3: current_evmcs->cr3_target_value3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case TPR_THRESHOLD: current_evmcs->tpr_threshold = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case GUEST_INTERRUPTIBILITY_INFO: current_evmcs->guest_interruptibility_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; break; case CPU_BASED_VM_EXEC_CONTROL: current_evmcs->cpu_based_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC; break; case EXCEPTION_BITMAP: current_evmcs->exception_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN; break; case VM_ENTRY_CONTROLS: current_evmcs->vm_entry_controls = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY; break; case VM_ENTRY_INTR_INFO_FIELD: current_evmcs->vm_entry_intr_info_field = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; break; case VM_ENTRY_EXCEPTION_ERROR_CODE: current_evmcs->vm_entry_exception_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; break; case VM_ENTRY_INSTRUCTION_LEN: current_evmcs->vm_entry_instruction_len = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; break; case HOST_IA32_SYSENTER_CS: current_evmcs->host_ia32_sysenter_cs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case PIN_BASED_VM_EXEC_CONTROL: current_evmcs->pin_based_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; break; case VM_EXIT_CONTROLS: current_evmcs->vm_exit_controls = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; break; case SECONDARY_VM_EXEC_CONTROL: current_evmcs->secondary_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; break; case GUEST_ES_LIMIT: current_evmcs->guest_es_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_CS_LIMIT: current_evmcs->guest_cs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_SS_LIMIT: current_evmcs->guest_ss_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_DS_LIMIT: current_evmcs->guest_ds_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_FS_LIMIT: current_evmcs->guest_fs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GS_LIMIT: current_evmcs->guest_gs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_LDTR_LIMIT: current_evmcs->guest_ldtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_TR_LIMIT: current_evmcs->guest_tr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GDTR_LIMIT: current_evmcs->guest_gdtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_IDTR_LIMIT: current_evmcs->guest_idtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_ES_AR_BYTES: current_evmcs->guest_es_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_CS_AR_BYTES: current_evmcs->guest_cs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_SS_AR_BYTES: current_evmcs->guest_ss_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_DS_AR_BYTES: current_evmcs->guest_ds_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_FS_AR_BYTES: current_evmcs->guest_fs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GS_AR_BYTES: current_evmcs->guest_gs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_LDTR_AR_BYTES: current_evmcs->guest_ldtr_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_TR_AR_BYTES: current_evmcs->guest_tr_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_ACTIVITY_STATE: current_evmcs->guest_activity_state = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case GUEST_SYSENTER_CS: current_evmcs->guest_sysenter_cs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; break; case VM_INSTRUCTION_ERROR: current_evmcs->vm_instruction_error = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_REASON: current_evmcs->vm_exit_reason = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_INTR_INFO: current_evmcs->vm_exit_intr_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_INTR_ERROR_CODE: current_evmcs->vm_exit_intr_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case IDT_VECTORING_INFO_FIELD: current_evmcs->idt_vectoring_info_field = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case IDT_VECTORING_ERROR_CODE: current_evmcs->idt_vectoring_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VM_EXIT_INSTRUCTION_LEN: current_evmcs->vm_exit_instruction_len = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case VMX_INSTRUCTION_INFO: current_evmcs->vmx_instruction_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; break; case PAGE_FAULT_ERROR_CODE_MASK: current_evmcs->page_fault_error_code_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case PAGE_FAULT_ERROR_CODE_MATCH: current_evmcs->page_fault_error_code_match = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case CR3_TARGET_COUNT: current_evmcs->cr3_target_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_EXIT_MSR_STORE_COUNT: current_evmcs->vm_exit_msr_store_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_EXIT_MSR_LOAD_COUNT: current_evmcs->vm_exit_msr_load_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case VM_ENTRY_MSR_LOAD_COUNT: current_evmcs->vm_entry_msr_load_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; break; case HOST_ES_SELECTOR: current_evmcs->host_es_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_CS_SELECTOR: current_evmcs->host_cs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_SS_SELECTOR: current_evmcs->host_ss_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_DS_SELECTOR: current_evmcs->host_ds_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_FS_SELECTOR: current_evmcs->host_fs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_GS_SELECTOR: current_evmcs->host_gs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case HOST_TR_SELECTOR: current_evmcs->host_tr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; break; case GUEST_ES_SELECTOR: current_evmcs->guest_es_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_CS_SELECTOR: current_evmcs->guest_cs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_SS_SELECTOR: current_evmcs->guest_ss_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_DS_SELECTOR: current_evmcs->guest_ds_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_FS_SELECTOR: current_evmcs->guest_fs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_GS_SELECTOR: current_evmcs->guest_gs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_LDTR_SELECTOR: current_evmcs->guest_ldtr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case GUEST_TR_SELECTOR: current_evmcs->guest_tr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; break; case VIRTUAL_PROCESSOR_ID: current_evmcs->virtual_processor_id = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; break; default: return 1; } @@ -1070,7 +1215,10 @@ static inline int evmcs_vmresume(void) { int ret; - current_evmcs->hv_clean_fields = 0; + /* HOST_RIP */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + /* HOST_RSP */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; __asm__ __volatile__("push %%rbp;" "push %%rcx;" diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h index f4ea2355dbc2..2225e5077350 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -99,7 +99,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u8 reserved_6[8]; /* Offset 0xe8 */ u64 avic_logical_id; /* Offset 0xf0 */ u64 avic_physical_id; /* Offset 0xf8 */ - u8 reserved_7[768]; + u8 reserved_7[8]; + u64 vmsa_pa; /* Used for an SEV-ES guest */ + u8 reserved_8[720]; + /* + * Offset 0x3e0, 32 bytes reserved + * for use by hypervisor/software. + */ + u8 reserved_sw[32]; }; diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index 587fbe408b99..a25aabd8f5e7 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -16,6 +16,7 @@ #define CPUID_SVM_BIT 2 #define CPUID_SVM BIT_ULL(CPUID_SVM_BIT) +#define SVM_EXIT_MSR 0x07c #define SVM_EXIT_VMMCALL 0x081 struct svm_test_data { @@ -28,6 +29,11 @@ struct svm_test_data { struct vmcb_save_area *save_area; /* gva */ void *save_area_hva; uint64_t save_area_gpa; + + /* MSR-Bitmap */ + void *msr; /* gva */ + void *msr_hva; + uint64_t msr_gpa; }; struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 0ebc03ce079c..736ee4a23df6 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -43,6 +43,11 @@ vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); + svm->msr = (void *)vm_vaddr_alloc_page(vm); + svm->msr_hva = addr_gva2hva(vm, (uintptr_t)svm->msr); + svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); + memset(svm->msr_hva, 0, getpagesize()); + *p_svm_gva = svm_gva; return svm; } @@ -106,6 +111,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR); ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | (1ULL << INTERCEPT_VMMCALL); + ctrl->msrpm_base_pa = svm->msr_gpa; vmcb->save.rip = (u64)guest_rip; vmcb->save.rsp = (u64)guest_rsp; diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 4c7841dfd481..d12e043aa2ee 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -10,6 +10,7 @@ #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> +#include <linux/bitmap.h> #include "test_util.h" @@ -32,6 +33,22 @@ static void guest_nmi_handler(struct ex_regs *regs) { } +/* Exits to L1 destroy GRPs! */ +static inline void rdmsr_fs_base(void) +{ + __asm__ __volatile__ ("mov $0xc0000100, %%rcx; rdmsr" : : : + "rax", "rbx", "rcx", "rdx", + "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15"); +} +static inline void rdmsr_gs_base(void) +{ + __asm__ __volatile__ ("mov $0xc0000101, %%rcx; rdmsr" : : : + "rax", "rbx", "rcx", "rdx", + "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15"); +} + void l2_guest_code(void) { GUEST_SYNC(7); @@ -41,6 +58,15 @@ void l2_guest_code(void) /* Forced exit to L1 upon restore */ GUEST_SYNC(9); + vmcall(); + + /* MSR-Bitmap tests */ + rdmsr_fs_base(); /* intercepted */ + rdmsr_fs_base(); /* intercepted */ + rdmsr_gs_base(); /* not intercepted */ + vmcall(); + rdmsr_gs_base(); /* intercepted */ + /* Done, exit to L1 and never come back. */ vmcall(); } @@ -76,8 +102,9 @@ void guest_code(struct vmx_pages *vmx_pages) current_evmcs->revision_id = EVMCS_VERSION; GUEST_SYNC(6); - current_evmcs->pin_based_vm_exec_control |= - PIN_BASED_NMI_EXITING; + vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) | + PIN_BASED_NMI_EXITING); + GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); @@ -91,6 +118,39 @@ void guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(10); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + current_evmcs->guest_rip += 3; /* vmcall */ + + /* Intercept RDMSR 0xc0000100 */ + vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) | + CPU_BASED_USE_MSR_BITMAPS); + set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* Enable enlightened MSR bitmap */ + current_evmcs->hv_enlightenments_control.msr_bitmap = 1; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* Intercept RDMSR 0xc0000101 without telling KVM about it */ + set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400); + /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ + current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + GUEST_ASSERT(!vmresume()); + /* Make sure we don't see EXIT_REASON_MSR_READ here so eMSR bitmap works */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + current_evmcs->guest_rip += 3; /* vmcall */ + + /* Now tell KVM we've changed MSR-Bitmap */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); GUEST_SYNC(11); /* Try enlightened vmptrld with an incorrect GPA */ diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 7e2d2d17d2ed..8c245ab2d98a 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -49,16 +49,13 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, bool evmcs_expected) { int i; - int nent = 9; + int nent_expected = 10; u32 test_val; - if (evmcs_expected) - nent += 1; /* 0x4000000A */ - - TEST_ASSERT(hv_cpuid_entries->nent == nent, + TEST_ASSERT(hv_cpuid_entries->nent == nent_expected, "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" - " with evmcs=%d (returned %d)", - nent, evmcs_expected, hv_cpuid_entries->nent); + " (returned %d)", + nent_expected, hv_cpuid_entries->nent); for (i = 0; i < hv_cpuid_entries->nent; i++) { struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; @@ -68,9 +65,6 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, "function %x is our of supported range", entry->function); - TEST_ASSERT(evmcs_expected || (entry->function != 0x4000000A), - "0x4000000A leaf should not be reported"); - TEST_ASSERT(entry->index == 0, ".index field should be zero"); @@ -97,8 +91,20 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); break; - } + case 0x4000000A: + TEST_ASSERT(entry->eax & (1UL << 19), + "Enlightened MSR-Bitmap should always be supported" + " 0x40000000.EAX: %x", entry->eax); + if (evmcs_expected) + TEST_ASSERT((entry->eax & 0xffff) == 0x101, + "Supported Enlightened VMCS version range is supposed to be 1:1" + " 0x40000000.EAX: %x", entry->eax); + + break; + default: + break; + } /* * If needed for debug: * fprintf(stdout, @@ -107,7 +113,6 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, * entry->edx); */ } - } void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c new file mode 100644 index 000000000000..21f5ca9197da --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM_GET/SET_* tests + * + * Copyright (C) 2022, Red Hat, Inc. + * + * Tests for Hyper-V extensions to SVM. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <linux/bitmap.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "hyperv.h" + +#define VCPU_ID 1 +#define L2_GUEST_STACK_SIZE 256 + +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define VMCB_HV_NESTED_ENLIGHTENMENTS (1U << 31) + +static inline void vmmcall(void) +{ + __asm__ __volatile__("vmmcall"); +} + +void l2_guest_code(void) +{ + GUEST_SYNC(3); + /* Exit to L1 */ + vmmcall(); + + /* MSR-Bitmap tests */ + rdmsr(MSR_FS_BASE); /* intercepted */ + rdmsr(MSR_FS_BASE); /* intercepted */ + rdmsr(MSR_GS_BASE); /* not intercepted */ + vmmcall(); + rdmsr(MSR_GS_BASE); /* intercepted */ + + GUEST_SYNC(5); + + /* Done, exit to L1 and never come back. */ + vmmcall(); +} + +static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + struct hv_enlightenments *hve = + (struct hv_enlightenments *)vmcb->control.reserved_sw; + + GUEST_SYNC(1); + + wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48); + + GUEST_ASSERT(svm->vmcb_gpa); + /* Prepare for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(2); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(4); + vmcb->save.rip += 3; + + /* Intercept RDMSR 0xc0000100 */ + vmcb->control.intercept |= 1ULL << INTERCEPT_MSR_PROT; + set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + /* Enable enlightened MSR bitmap */ + hve->hv_enlightenments_control.msr_bitmap = 1; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + /* Intercept RDMSR 0xc0000101 without telling KVM about it */ + set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800); + /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ + vmcb->control.clean |= VMCB_HV_NESTED_ENLIGHTENMENTS; + run_guest(vmcb, svm->vmcb_gpa); + /* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */ + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + vmcb->save.rip += 3; /* vmcall */ + + /* Now tell KVM we've changed MSR-Bitmap */ + vmcb->control.clean &= ~VMCB_HV_NESTED_ENLIGHTENMENTS; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(6); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0; + + struct kvm_vm *vm; + struct kvm_run *run; + struct ucall uc; + int stage; + + if (!nested_svm_supported()) { + print_skip("Nested SVM not supported"); + exit(KSFT_SKIP); + } + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_hv_cpuid(vm, VCPU_ID); + run = vcpu_state(vm, VCPU_ID); + vcpu_alloc_svm(vm, &nested_gva); + vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 80056bbbb003..d1dc1acf997c 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -21,6 +21,8 @@ #define NR_LOCK_TESTING_THREADS 3 #define NR_LOCK_TESTING_ITERATIONS 10000 +bool have_sev_es; + static int __sev_ioctl(int vm_fd, int cmd_id, void *data, __u32 *fw_error) { struct kvm_sev_cmd cmd = { @@ -172,10 +174,18 @@ static void test_sev_migrate_parameters(void) *sev_es_vm_no_vmsa; int ret; - sev_vm = sev_vm_create(/* es= */ false); - sev_es_vm = sev_vm_create(/* es= */ true); vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); vm_no_sev = aux_vm_create(true); + ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Migrations require SEV enabled. ret %d, errno: %d\n", ret, + errno); + + if (!have_sev_es) + goto out; + + sev_vm = sev_vm_create(/* es= */ false); + sev_es_vm = sev_vm_create(/* es= */ true); sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); vm_vcpu_add(sev_es_vm_no_vmsa, 1); @@ -204,14 +214,10 @@ static void test_sev_migrate_parameters(void) "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d\n", ret, errno); - ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd); - TEST_ASSERT(ret == -1 && errno == EINVAL, - "Migrations require SEV enabled. ret %d, errno: %d\n", ret, - errno); - kvm_vm_free(sev_vm); kvm_vm_free(sev_es_vm); kvm_vm_free(sev_es_vm_no_vmsa); +out: kvm_vm_free(vm_no_vcpu); kvm_vm_free(vm_no_sev); } @@ -300,7 +306,6 @@ static void test_sev_mirror_parameters(void) int ret; sev_vm = sev_vm_create(/* es= */ false); - sev_es_vm = sev_vm_create(/* es= */ true); vm_with_vcpu = aux_vm_create(true); vm_no_vcpu = aux_vm_create(false); @@ -310,6 +315,21 @@ static void test_sev_mirror_parameters(void) "Should not be able copy context to self. ret: %d, errno: %d\n", ret, errno); + ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Copy context requires SEV enabled. ret %d, errno: %d\n", ret, + errno); + + ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n", + ret, errno); + + if (!have_sev_es) + goto out; + + sev_es_vm = sev_vm_create(/* es= */ true); ret = __sev_mirror_create(sev_vm->fd, sev_es_vm->fd); TEST_ASSERT( ret == -1 && errno == EINVAL, @@ -322,63 +342,97 @@ static void test_sev_mirror_parameters(void) "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d\n", ret, errno); - ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd); - TEST_ASSERT(ret == -1 && errno == EINVAL, - "Copy context requires SEV enabled. ret %d, errno: %d\n", ret, - errno); - - ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n", - ret, errno); + kvm_vm_free(sev_es_vm); +out: kvm_vm_free(sev_vm); - kvm_vm_free(sev_es_vm); kvm_vm_free(vm_with_vcpu); kvm_vm_free(vm_no_vcpu); } static void test_sev_move_copy(void) { - struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm; - int ret; + struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm, + *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm; sev_vm = sev_vm_create(/* es= */ false); dst_vm = aux_vm_create(true); + dst2_vm = aux_vm_create(true); + dst3_vm = aux_vm_create(true); mirror_vm = aux_vm_create(false); dst_mirror_vm = aux_vm_create(false); + dst2_mirror_vm = aux_vm_create(false); + dst3_mirror_vm = aux_vm_create(false); sev_mirror_create(mirror_vm->fd, sev_vm->fd); - ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd); - TEST_ASSERT(ret == -1 && errno == EBUSY, - "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret, - errno); - /* The mirror itself can be migrated. */ sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd); - ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd); - TEST_ASSERT(ret == -1 && errno == EBUSY, - "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret, - errno); + sev_migrate_from(dst_vm->fd, sev_vm->fd); + + sev_migrate_from(dst2_vm->fd, dst_vm->fd); + sev_migrate_from(dst2_mirror_vm->fd, dst_mirror_vm->fd); + + sev_migrate_from(dst3_mirror_vm->fd, dst2_mirror_vm->fd); + sev_migrate_from(dst3_vm->fd, dst2_vm->fd); + + kvm_vm_free(dst_vm); + kvm_vm_free(sev_vm); + kvm_vm_free(dst2_vm); + kvm_vm_free(dst3_vm); + kvm_vm_free(mirror_vm); + kvm_vm_free(dst_mirror_vm); + kvm_vm_free(dst2_mirror_vm); + kvm_vm_free(dst3_mirror_vm); /* - * mirror_vm is not a mirror anymore, dst_mirror_vm is. Thus, - * the owner can be copied as soon as dst_mirror_vm is gone. + * Run similar test be destroy mirrors before mirrored VMs to ensure + * destruction is done safely. */ - kvm_vm_free(dst_mirror_vm); + sev_vm = sev_vm_create(/* es= */ false); + dst_vm = aux_vm_create(true); + mirror_vm = aux_vm_create(false); + dst_mirror_vm = aux_vm_create(false); + + sev_mirror_create(mirror_vm->fd, sev_vm->fd); + + sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd); sev_migrate_from(dst_vm->fd, sev_vm->fd); kvm_vm_free(mirror_vm); + kvm_vm_free(dst_mirror_vm); kvm_vm_free(dst_vm); kvm_vm_free(sev_vm); } +#define X86_FEATURE_SEV (1 << 1) +#define X86_FEATURE_SEV_ES (1 << 3) + int main(int argc, char *argv[]) { + struct kvm_cpuid_entry2 *cpuid; + + if (!kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM) && + !kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { + print_skip("Capabilities not available"); + exit(KSFT_SKIP); + } + + cpuid = kvm_get_supported_cpuid_entry(0x80000000); + if (cpuid->eax < 0x8000001f) { + print_skip("AMD memory encryption not available"); + exit(KSFT_SKIP); + } + cpuid = kvm_get_supported_cpuid_entry(0x8000001f); + if (!(cpuid->eax & X86_FEATURE_SEV)) { + print_skip("AMD SEV not available"); + exit(KSFT_SKIP); + } + have_sev_es = !!(cpuid->eax & X86_FEATURE_SEV_ES); + if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { test_sev_migrate_from(/* es= */ false); - test_sev_migrate_from(/* es= */ true); + if (have_sev_es) + test_sev_migrate_from(/* es= */ true); test_sev_migrate_locking(); test_sev_migrate_parameters(); if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) @@ -386,7 +440,8 @@ int main(int argc, char *argv[]) } if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { test_sev_mirror(/* es= */ false); - test_sev_mirror(/* es= */ true); + if (have_sev_es) + test_sev_mirror(/* es= */ true); test_sev_mirror_parameters(); } return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 58d31da8a2f7..83c57bcc6eb6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -246,9 +246,8 @@ static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait) return true; } -static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int req, struct cpumask *tmp, - int current_cpu) +static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req, + struct cpumask *tmp, int current_cpu) { int cpu; @@ -291,7 +290,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, vcpu = kvm_get_vcpu(kvm, i); if (!vcpu) continue; - kvm_make_vcpu_request(kvm, vcpu, req, cpus, me); + kvm_make_vcpu_request(vcpu, req, cpus, me); } called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); @@ -317,7 +316,7 @@ bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu == except) continue; - kvm_make_vcpu_request(kvm, vcpu, req, cpus, me); + kvm_make_vcpu_request(vcpu, req, cpus, me); } called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); |