diff options
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.h | 48 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 283 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 63 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.c | 278 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs12.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs12.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 14 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 490 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 52 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx_ops.h | 43 |
14 files changed, 856 insertions, 462 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 4705ad55abb5..3f430e218375 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -5,6 +5,7 @@ #include <asm/vmx.h> #include "lapic.h" +#include "x86.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -53,7 +54,6 @@ struct nested_vmx_msrs { struct vmcs_config { int size; - int order; u32 basic_cap; u32 revision_id; u32 pin_based_exec_ctrl; @@ -312,6 +312,15 @@ static inline bool cpu_has_vmx_ept_1g_page(void) return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } +static inline int ept_caps_to_lpage_level(u32 ept_caps) +{ + if (ept_caps & VMX_EPT_1GB_PAGE_BIT) + return PG_LEVEL_1G; + if (ept_caps & VMX_EPT_2MB_PAGE_BIT) + return PG_LEVEL_2M; + return PG_LEVEL_4K; +} + static inline bool cpu_has_vmx_ept_ad_bits(void) { return vmx_capability.ept & VMX_EPT_AD_BIT; @@ -380,6 +389,9 @@ static inline u64 vmx_get_perf_capabilities(void) { u64 perf_cap = 0; + if (!enable_pmu) + return perf_cap; + if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index ba6f99f584ac..87e3dc10edf4 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -12,8 +12,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); -#if IS_ENABLED(CONFIG_HYPERV) - #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} @@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { }; const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); +#if IS_ENABLED(CONFIG_HYPERV) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; @@ -362,6 +361,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) case MSR_IA32_VMX_PROCBASED_CTLS2: ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; break; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; break; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 16731d2cf231..8d70f9aea94b 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); SECONDARY_EXEC_SHADOW_VMCS | \ SECONDARY_EXEC_TSC_SCALING | \ SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) -#if IS_ENABLED(CONFIG_HYPERV) - struct evmcs_field { u16 offset; u16 clean_field; @@ -73,26 +73,56 @@ struct evmcs_field { extern const struct evmcs_field vmcs_field_to_evmcs_1[]; extern const unsigned int nr_evmcs_1_fields; -static __always_inline int get_evmcs_offset(unsigned long field, - u16 *clean_field) +static __always_inline int evmcs_field_offset(unsigned long field, + u16 *clean_field) { unsigned int index = ROL16(field, 6); const struct evmcs_field *evmcs_field; - if (unlikely(index >= nr_evmcs_1_fields)) { - WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", - field); + if (unlikely(index >= nr_evmcs_1_fields)) return -ENOENT; - } evmcs_field = &vmcs_field_to_evmcs_1[index]; + /* + * Use offset=0 to detect holes in eVMCS. This offset belongs to + * 'revision_id' but this field has no encoding and is supposed to + * be accessed directly. + */ + if (unlikely(!evmcs_field->offset)) + return -ENOENT; + if (clean_field) *clean_field = evmcs_field->clean_field; return evmcs_field->offset; } +static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, + unsigned long field, u16 offset) +{ + /* + * vmcs12_read_any() doesn't care whether the supplied structure + * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes + * the exact offset of the required field, use it for convenience + * here. + */ + return vmcs12_read_any((void *)evmcs, field, offset); +} + +#if IS_ENABLED(CONFIG_HYPERV) + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + int offset = evmcs_field_offset(field, clean_field); + + WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n", + field); + + return offset; +} + static __always_inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b213ca966d41..dc822a1d403d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7,6 +7,7 @@ #include <asm/mmu_context.h> #include "cpuid.h" +#include "evmcs.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" @@ -269,7 +270,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vmx_register_cache_reset(vcpu); + vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; + + /* + * All lazily updated registers will be reloaded from VMCS12 on both + * vmentry and vmexit. + */ + vcpu->arch.regs_dirty = 0; } /* @@ -391,9 +398,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.msrs.ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT, + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; + int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); + + kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, nested_ept_ad_enabled(vcpu), nested_ept_get_eptp(vcpu)); } @@ -591,6 +600,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ @@ -598,6 +608,19 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return false; + /* + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature + * and tells KVM (L0) there were no changes in MSR bitmap for L2. + */ + if (!vmx->nested.force_msr_bitmap_recalc && evmcs && + evmcs->hv_enlightenments_control.msr_bitmap && + evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) + return true; + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) return false; @@ -664,39 +687,47 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); + vmx->nested.force_msr_bitmap_recalc = false; + return true; } static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - struct kvm_host_map map; - struct vmcs12 *shadow; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; - shadow = get_shadow_vmcs12(vcpu); - - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - memcpy(shadow, map.hva, VMCS12_SIZE); - kvm_vcpu_unmap(vcpu, &map, false); + kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; - kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, - get_shadow_vmcs12(vcpu), VMCS12_SIZE); + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) + return; + + kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); } /* @@ -1089,7 +1120,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, * must not be dereferenced. */ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && - CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { + CC(!load_pdptrs(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; } @@ -1098,7 +1129,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); @@ -1156,29 +1187,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, WARN_ON(!enable_vpid); /* - * If VPID is enabled and used by vmc12, but L2 does not have a unique - * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate - * a VPID for L2, flush the current context as the effective ASID is - * common to both L1 and L2. - * - * Defer the flush so that it runs after vmcs02.EPTP has been set by - * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid - * redundant flushes further down the nested pipeline. - * - * If a TLB flush isn't required due to any of the above, and vpid12 is - * changing then the new "virtual" VPID (vpid12) will reuse the same - * "real" VPID (vpid02), and so needs to be flushed. There's no direct - * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for - * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate - * guest-physical mappings, so there is no need to sync the nEPT MMU. + * VPID is enabled and in use by vmcs12. If vpid12 is changing, then + * emulate a guest TLB flush as KVM does not track vpid12 history nor + * is the VPID incorporated into the MMU context. I.e. KVM must assume + * that the new vpid12 has never been used and thus represents a new + * guest ASID that cannot have entries in the TLB. */ - if (!nested_has_guest_tlb_tag(vcpu)) { - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } else if (is_vmenter && - vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - vpid_sync_context(nested_get_vpid02(vcpu)); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; } + + /* + * If VPID is enabled, used by vmc12, and vpid12 is not changing but + * does not have a unique TLB tag (ASID), i.e. EPT is disabled and + * KVM was unable to allocate a VPID for L2, flush the current context + * as the effective ASID is common to both L1 and L2. + */ + if (!nested_has_guest_tlb_tag(vcpu)) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) @@ -2018,10 +2046,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( * Clean fields data can't be used on VMLAUNCH and when we switch * between different L2 guests as KVM keeps a single VMCS12 per L1. */ - if (from_launch || evmcs_gpa_changed) + if (from_launch || evmcs_gpa_changed) { vmx->nested.hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + vmx->nested.force_msr_bitmap_recalc = true; + } + return EVMPTRLD_SUCCEEDED; } @@ -2588,8 +2619,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl))) + vmcs12->guest_ia32_perf_global_ctrl))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; + } kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); @@ -2830,6 +2863,17 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ +#ifdef CONFIG_X86_64 + if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != + !!(vcpu->arch.efer & EFER_LMA))) + return -EINVAL; +#endif + return 0; +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2854,18 +2898,16 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, return -EINVAL; #ifdef CONFIG_X86_64 - ia32e = !!(vcpu->arch.efer & EFER_LMA); + ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); #else ia32e = false; #endif if (ia32e) { - if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || - CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) return -EINVAL; } else { - if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || - CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || CC((vmcs12->host_rip) >> 32)) return -EINVAL; @@ -2910,9 +2952,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int r = 0; - struct vmcs12 *shadow; - struct kvm_host_map map; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; + struct vmcs_hdr hdr; if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; @@ -2920,17 +2962,21 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) return -EINVAL; - if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) - return -EINVAL; + if (ghc->gpa != vmcs12->vmcs_link_pointer && + CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE))) + return -EINVAL; - shadow = map.hva; + if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr)))) + return -EINVAL; - if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || - CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) - r = -EINVAL; + if (CC(hdr.revision_id != VMCS12_REVISION) || + CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) + return -EINVAL; - kvm_vcpu_unmap(vcpu, &map, false); - return r; + return 0; } /* @@ -3127,7 +3173,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. */ - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; } @@ -3325,8 +3371,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, }; u32 failed_index; - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + kvm_service_local_tlb_flush_requests(vcpu); evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); @@ -3487,10 +3532,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (evmptrld_status == EVMPTRLD_ERROR) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; - } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { - return nested_vmx_failInvalid(vcpu); } + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + + if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) + return nested_vmx_failInvalid(vcpu); + if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); @@ -3535,6 +3583,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (nested_vmx_check_controls(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + if (nested_vmx_check_address_space_size(vcpu, vmcs12)) + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + if (nested_vmx_check_host_state(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); @@ -3583,7 +3634,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } break; case GUEST_ACTIVITY_WAIT_SIPI: @@ -4480,9 +4531,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, (void)nested_get_evmcs_page(vcpu); } - /* Service the TLB flush request for L2 before switching to L1. */ - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + /* Service pending TLB flush requests for L2 before switching to L1. */ + kvm_service_local_tlb_flush_requests(vcpu); /* * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between @@ -4807,18 +4857,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; /* - * We should allocate a shadow vmcs for vmcs01 only when L1 - * executes VMXON and free it when L1 executes VMXOFF. - * As it is invalid to execute VMXON twice, we shouldn't reach - * here when vmcs01 already have an allocated shadow vmcs. + * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it + * when L1 executes VMXOFF or the vCPU is forced out of nested + * operation. VMXON faults if the CPU is already post-VMXON, so it + * should be impossible to already have an allocated shadow VMCS. KVM + * doesn't support virtualization of VMCS shadowing, so vmcs01 should + * always be the loaded VMCS. */ - WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) + return loaded_vmcs->shadow_vmcs; + + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); - if (!loaded_vmcs->shadow_vmcs) { - loaded_vmcs->shadow_vmcs = alloc_vmcs(true); - if (loaded_vmcs->shadow_vmcs) - vmcs_clear(loaded_vmcs->shadow_vmcs); - } return loaded_vmcs->shadow_vmcs; } @@ -4835,6 +4887,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; + vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5054,27 +5107,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - /* - * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, - * any VMREAD sets the ALU flags for VMfailInvalid. - */ - if (vmx->nested.current_vmptr == INVALID_GPA || - (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) - return nested_vmx_failInvalid(vcpu); - /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = vmcs_field_to_offset(field); - if (offset < 0) - return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + /* + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, + * any VMREAD sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == INVALID_GPA || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) + return nested_vmx_failInvalid(vcpu); - if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) - copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + offset = get_vmcs12_field_offset(field); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - /* Read the field, zero-extended to a u64 value */ - value = vmcs12_read_any(vmcs12, field, offset); + /* Read the field, zero-extended to a u64 value */ + value = vmcs12_read_any(vmcs12, field, offset); + } else { + /* + * Hyper-V TLFS (as of 6.0b) explicitly states, that while an + * enlightened VMCS is active VMREAD/VMWRITE instructions are + * unsupported. Unfortunately, certain versions of Windows 11 + * don't comply with this requirement which is not enforced in + * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a + * workaround, as misbehaving guests will panic on VM-Fail. + * Note, enlightened VMCS is incompatible with shadow VMCS so + * all VMREADs from L2 should go to L1. + */ + if (WARN_ON_ONCE(is_guest_mode(vcpu))) + return nested_vmx_failInvalid(vcpu); + + offset = evmcs_field_offset(field, NULL); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* Read the field, zero-extended to a u64 value */ + value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); + } /* * Now copy part of this value to register or memory, as requested. @@ -5169,7 +5244,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = vmcs_field_to_offset(field); + offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); @@ -5238,6 +5313,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) vmx->nested.need_vmcs12_to_shadow_sync = true; } vmx->nested.dirty_vmcs12 = true; + vmx->nested.force_msr_bitmap_recalc = true; } /* Emulate the VMPTRLD instruction */ @@ -5264,10 +5340,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; if (vmx->nested.current_vmptr != vmptr) { - struct kvm_host_map map; - struct vmcs12 *new_vmcs12; + struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; + struct vmcs_hdr hdr; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the @@ -5278,12 +5354,16 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } - new_vmcs12 = map.hva; + if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr))) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } - if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - (new_vmcs12->hdr.shadow_vmcs && + if (hdr.revision_id != VMCS12_REVISION || + (hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kvm_vcpu_unmap(vcpu, &map, false); return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5294,8 +5374,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * Load VMCS12 from guest memory since it is not already * cached. */ - memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kvm_vcpu_unmap(vcpu, &map, false); + if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, + VMCS12_SIZE)) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } set_current_vmptr(vmx, vmptr); } @@ -6366,6 +6449,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; + vmx->nested.force_msr_bitmap_recalc = true; ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) goto error_guest_mode; @@ -6408,7 +6492,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void) max_idx = 0; for (i = 0; i < nr_vmcs12_fields; i++) { /* The vmcs12 table is very, very sparsely populated. */ - if (!vmcs_field_to_offset_table[i]) + if (!vmcs12_field_offsets[i]) continue; idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); @@ -6717,6 +6801,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) } struct kvm_x86_nested_ops vmx_nested_ops = { + .leave_nested = vmx_leave_nested, .check_events = vmx_check_nested_events, .hv_timer_pending = nested_vmx_preemption_timer_pending, .triple_fault = nested_vmx_triple_fault, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 1b7456b2177b..466d18fc0c5d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -21,7 +21,6 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) static struct kvm_event_hw_type_mapping intel_arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, @@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + /* The above index must match CPUID 0x0A.EBX bit vector */ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; @@ -68,34 +68,29 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) reprogram_counter(pmu, bit); } -static unsigned intel_find_arch_event(struct kvm_pmu *pmu, - u8 event_select, - u8 unit_mask) +static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select - && intel_arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) - break; - - if (i == ARRAY_SIZE(intel_arch_events)) - return PERF_COUNT_HW_MAX; + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { + if (intel_arch_events[i].eventsel != event_select || + intel_arch_events[i].unit_mask != unit_mask) + continue; - return intel_arch_events[i].event_type; -} + /* disable event that reported as not present by cpuid */ + if ((i < 7) && !(pmu->available_event_types & (1 << i))) + return PERF_COUNT_HW_MAX + 1; -static unsigned intel_find_fixed_event(int idx) -{ - u32 event; - size_t size = ARRAY_SIZE(fixed_pmc_events); + break; + } - if (idx >= size) + if (i == ARRAY_SIZE(intel_arch_events)) return PERF_COUNT_HW_MAX; - event = fixed_pmc_events[array_index_nospec(idx, size)]; - return intel_arch_events[event].event_type; + return intel_arch_events[i].event_type; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ @@ -459,6 +454,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } +static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) +{ + size_t size = ARRAY_SIZE(fixed_pmc_events); + struct kvm_pmc *pmc; + u32 event; + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + event = fixed_pmc_events[array_index_nospec(i, size)]; + pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | + intel_arch_events[event].eventsel; + } +} + static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -477,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) + if (!entry || !enable_pmu) return; eax.full = entry->eax; edx.full = entry->edx; @@ -500,12 +510,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - x86_pmu.num_counters_fixed); + min3(ARRAY_SIZE(fixed_pmc_events), + (size_t) edx.split.num_counters_fixed, + (size_t) x86_pmu.num_counters_fixed); edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; + setup_fixed_pmc_eventsel(pmu); } pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | @@ -703,8 +715,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops intel_pmu_ops = { - .find_arch_event = intel_find_arch_event, - .find_fixed_event = intel_find_fixed_event, + .pmc_perf_hw_id = intel_pmc_perf_hw_id, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5f81ef092bd4..aa1fe9085d77 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -5,63 +5,115 @@ #include <asm/cpu.h> #include "lapic.h" +#include "irq.h" #include "posted_intr.h" #include "trace.h" #include "vmx.h" /* - * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. + * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() + * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when + * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled. + * The vCPUs posted interrupt descriptor is updated at the same time to set its + * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices + * wake the target vCPUs. vCPUs are removed from the list and the notification + * vector is reset when the vCPU is scheduled in. */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); +static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); +/* + * Protect the per-CPU list with a per-CPU spinlock to handle task migration. + * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the + * ->sched_in() path will need to take the vCPU off the list of the _previous_ + * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will + * occur if a wakeup IRQ arrives and attempts to acquire the lock. + */ +static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { return &(to_vmx(vcpu)->pi_desc); } +static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) +{ + /* + * PID.ON can be set at any time by a different vCPU or by hardware, + * e.g. a device. PID.control must be written atomically, and the + * update must be retried with a fresh snapshot an ON change causes + * the cmpxchg to fail. + */ + if (cmpxchg64(&pi_desc->control, old, new) != old) + return -EBUSY; + + return 0; +} + void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; + unsigned long flags; unsigned int dest; /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. + * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and + * PI.SN up-to-date even if there is no assigned device or if APICv is + * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC. */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + if (!enable_apicv || !lapic_in_kernel(vcpu)) return; /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. + * If the vCPU wasn't on the wakeup list and wasn't migrated, then the + * full update can be skipped as neither the vector nor the destination + * needs to be changed. */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { + /* + * Clear SN if it was set due to being preempted. Again, do + * this even if there is no assigned device for simplicity. + */ + if (pi_test_and_clear_sn(pi_desc)) + goto after_clear_sn; + return; } - /* The full case. */ - do { - old.control = new.control = pi_desc->control; + local_irq_save(flags); - dest = cpu_physical_id(cpu); + /* + * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup + * list of the _previous_ pCPU, which will not be the same as the + * current pCPU if the task was migrated. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_del(&vmx->pi_wakeup_list); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + } - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; + dest = cpu_physical_id(cpu); + if (!x2apic_mode) + dest = (dest << 8) & 0xFF00; + + do { + old.control = new.control = READ_ONCE(pi_desc->control); + /* + * Clear SN (as above) and refresh the destination APIC ID to + * handle task migration (@cpu != vcpu->cpu). + */ + new.ndst = dest; new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); + + /* + * Restore the notification vector; in the blocking case, the + * descriptor was modified on "put" to use the wakeup vector. + */ + new.nv = POSTED_INTR_VECTOR; + } while (pi_try_set_control(pi_desc, old.control, new.control)); + + local_irq_restore(flags); after_clear_sn: @@ -77,130 +129,71 @@ after_clear_sn: pi_set_on(pi_desc); } -void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - -static void __pi_post_block(struct kvm_vcpu *vcpu) +static bool vmx_can_use_vtd_pi(struct kvm *kvm) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - do { - old.control = new.control = pi_desc->control; - WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the VCPU is blocked\n"); - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - vcpu->pre_pcpu = -1; - } + return irqchip_in_kernel(kvm) && enable_apicv && + kvm_arch_has_assigned_device(kvm) && + irq_remapping_cap(IRQ_POSTING_CAP); } /* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set + * WAKEUP as the notification vector in the PI descriptor. */ -int pi_pre_block(struct kvm_vcpu *vcpu) +static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { - unsigned int dest; - struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct pi_desc old, new; + unsigned long flags; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - } - - do { - old.control = new.control = pi_desc->control; + local_irq_save(flags); - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_add_tail(&vmx->pi_wakeup_list, + &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. - */ - dest = cpu_physical_id(vcpu->pre_pcpu); + WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; + do { + old.control = new.control = READ_ONCE(pi_desc->control); /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); + } while (pi_try_set_control(pi_desc, old.control, new.control)); - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc) == 1) - __pi_post_block(vcpu); + /* + * Send a wakeup IPI to this CPU if an interrupt may have been posted + * before the notification vector was updated, in which case the IRQ + * will arrive on the non-wakeup vector. An IPI is needed as calling + * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not + * enabled until it is safe to call try_to_wake_up() on the task being + * scheduled out). + */ + if (pi_test_on(&new)) + apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); - local_irq_enable(); - return (vcpu->pre_pcpu == -1); + local_irq_restore(flags); } -void pi_post_block(struct kvm_vcpu *vcpu) +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { - if (vcpu->pre_pcpu == -1) + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; - WARN_ON(irqs_disabled()); - local_irq_disable(); - __pi_post_block(vcpu); - local_irq_enable(); + if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + pi_enable_wakeup_handler(vcpu); + + /* + * Set SN when the vCPU is preempted. Note, the vCPU can both be seen + * as blocking and preempted, e.g. if it's preempted between setting + * its wait state and manually scheduling out. + */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } /* @@ -208,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu) */ void pi_wakeup_handler(void) { - struct kvm_vcpu *vcpu; int cpu = smp_processor_id(); + struct vcpu_vmx *vmx; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); + list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu), + pi_wakeup_list) { - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); + if (pi_test_on(&vmx->pi_desc)) + kvm_vcpu_wake_up(&vmx->vcpu); } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } void __init pi_init_cpu(int cpu) { - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); + raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) @@ -241,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) * Bail out of the block loop if the VM has an assigned * device, but the blocking vCPU didn't reconfigure the * PI.NV to the wakeup vector, i.e. the assigned device - * came along after the initial check in pi_pre_block(). + * came along after the initial check in vmx_vcpu_pi_put(). */ void vmx_pi_start_assignment(struct kvm *kvm) { @@ -270,9 +262,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct vcpu_data vcpu_info; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) + if (!vmx_can_use_vtd_pi(kvm)) return 0; idx = srcu_read_lock(&kvm->irq_srcu); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 7f7b2326caf5..eb14e76b84ef 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -40,7 +40,13 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } -static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } @@ -74,13 +80,13 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } -static inline int pi_test_on(struct pi_desc *pi_desc) +static inline bool pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); } -static inline int pi_test_sn(struct pi_desc *pi_desc) +static inline bool pi_test_sn(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); @@ -88,8 +94,6 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); -int pi_pre_block(struct kvm_vcpu *vcpu); -void pi_post_block(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 6e5de2e2b0da..e325c290a816 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -129,6 +129,11 @@ static inline bool is_machine_check(u32 intr_info) return is_exception_n(intr_info, MC_VECTOR); } +static inline bool is_nm_fault(u32 intr_info) +{ + return is_exception_n(intr_info, NM_VECTOR); +} + /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index cab6ba7a5005..2251b60920f8 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -8,7 +8,7 @@ FIELD(number, name), \ [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) -const unsigned short vmcs_field_to_offset_table[] = { +const unsigned short vmcs12_field_offsets[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), @@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), }; -const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table); +const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 2a45f026ee11..746129ddd5ae 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(guest_pml_index, 996); } -extern const unsigned short vmcs_field_to_offset_table[]; +extern const unsigned short vmcs12_field_offsets[]; extern const unsigned int nr_vmcs12_fields; -static inline short vmcs_field_to_offset(unsigned long field) +static inline short get_vmcs12_field_offset(unsigned long field) { unsigned short offset; unsigned int index; @@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field) return -ENOENT; index = array_index_nospec(index, nr_vmcs12_fields); - offset = vmcs_field_to_offset_table[index]; + offset = vmcs12_field_offsets[index]; if (offset == 0) return -ENOENT; return offset; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 3a6461694fc2..435c187927c4 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -49,14 +49,14 @@ SYM_FUNC_START_LOCAL(vmx_vmenter) je 2f 1: vmresume - ret + RET 2: vmlaunch - ret + RET 3: cmpb $0, kvm_rebooting je 4f - ret + RET 4: ud2 _ASM_EXTABLE(1b, 3b) @@ -89,7 +89,7 @@ SYM_FUNC_START(vmx_vmexit) pop %_ASM_AX .Lvmexit_skip_rsb: #endif - ret + RET SYM_FUNC_END(vmx_vmexit) /** @@ -228,7 +228,7 @@ SYM_FUNC_START(__vmx_vcpu_run) pop %edi #endif pop %_ASM_BP - ret + RET /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ 2: mov $1, %eax @@ -293,7 +293,7 @@ SYM_FUNC_START(vmread_error_trampoline) pop %_ASM_AX pop %_ASM_BP - ret + RET SYM_FUNC_END(vmread_error_trampoline) SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) @@ -326,5 +326,5 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) */ mov %_ASM_BP, %_ASM_SP pop %_ASM_BP - ret + RET SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ba66c171d951..b730d799c26e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -36,6 +36,7 @@ #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/fpu/api.h> +#include <asm/fpu/xstate.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> @@ -161,6 +162,8 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE, + MSR_IA32_XFD, + MSR_IA32_XFD_ERR, #endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, @@ -602,15 +605,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; - u64 old_msr_data = msr->data; - msr->data = data; if (msr->load_into_hardware) { preempt_disable(); - ret = kvm_set_user_return_msr(slot, msr->data, msr->mask); + ret = kvm_set_user_return_msr(slot, data, msr->mask); preempt_enable(); - if (ret) - msr->data = old_msr_data; } + if (!ret) + msr->data = data; return ret; } @@ -763,6 +764,14 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); } + /* + * Disabling xfd interception indicates that dynamic xfeatures + * might be used in the guest. Always trap #NM in this case + * to save guest xfd_err timely. + */ + if (vcpu->arch.xfd_no_write_intercept) + eb |= (1u << NM_VECTOR); + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1271,7 +1280,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (!already_loaded) { void *gdt = get_current_gdt_ro(); - unsigned long sysenter_esp; /* * Flush all EPTP/VPID contexts, the new pCPU may have stale @@ -1287,8 +1295,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { + /* 22.2.3 */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(cpu) + 1)); + } vmx->loaded_vmcs->cpu = cpu; } @@ -1363,6 +1374,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmx->emulation_required = vmx_emulation_required(vcpu); } +static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +{ + return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; +} + u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -1464,11 +1480,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does - * not point tthe failing instruction, and even if it did, the code + * not point at the failing instruction, and even if it did, the code * stream is inaccessible. Inject #UD instead of exiting to userspace * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). @@ -1748,7 +1765,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) } /* - * Reads an msr value (of 'msr_index') into 'pdata'. + * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ @@ -1955,6 +1972,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KERNEL_GS_BASE: vmx_write_guest_kernel_gs_base(vmx, data); break; + case MSR_IA32_XFD: + ret = kvm_set_msr_common(vcpu, msr_info); + /* + * Always intercepting WRMSR could incur non-negligible + * overhead given xfd might be changed frequently in + * guest context switch. Disable write interception + * upon the first write with a non-zero value (indicating + * potential usage on dynamic xfeatures). Also update + * exception bitmap to trap #NM for proper virtualization + * of guest xfd_err. + */ + if (!ret && data) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, + MSR_TYPE_RW); + vcpu->arch.xfd_no_write_intercept = true; + vmx_update_exception_bitmap(vcpu); + } + break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu)) @@ -2095,9 +2130,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } ret = kvm_set_msr_common(vcpu, msr_info); break; - case MSR_IA32_TSC_ADJUST: - ret = kvm_set_msr_common(vcpu, msr_info); - break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && !(to_vmx(vcpu)->msr_ia32_feature_control & @@ -2565,7 +2597,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; @@ -2590,7 +2621,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, flags, vmcs_config.order); + pages = __alloc_pages_node(node, flags, 0); if (!pages) return NULL; vmcs = page_address(pages); @@ -2609,7 +2640,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) void free_vmcs(struct vmcs *vmcs) { - free_pages((unsigned long)vmcs, vmcs_config.order); + free_page((unsigned long)vmcs); } /* @@ -2646,15 +2677,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); - - if (IS_ENABLED(CONFIG_HYPERV) && - static_branch_unlikely(&enable_evmcs) && - (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { - struct hv_enlightened_vmcs *evmcs = - (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; - - evmcs->hv_enlightenments_control.msr_bitmap = 1; - } } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); @@ -2918,6 +2940,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) } } +static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return nested_get_vpid02(vcpu); + return to_vmx(vcpu)->vpid; +} + static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -2930,31 +2959,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, mmu->shadow_root_level)); - else if (!is_guest_mode(vcpu)) - vpid_sync_context(to_vmx(vcpu)->vpid); else - vpid_sync_context(nested_get_vpid02(vcpu)); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* - * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in + * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in * vmx_flush_tlb_guest() for an explanation of why this is ok. */ - vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr); + vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* - * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 - * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit - * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a + * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are + * required to flush GVA->{G,H}PA mappings from the TLB if vpid is * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), * i.e. no explicit INVVPID is necessary. */ - vpid_sync_context(to_vmx(vcpu)->vpid); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -2984,7 +3011,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); } #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ @@ -3066,6 +3093,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ if ((old_cr0_pg ^ cr0) & X86_CR0_PG) vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + + /* + * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but + * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. + */ + if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); } /* depends on vcpu->arch.cr0 to be set to a new value */ @@ -3109,9 +3143,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; - else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) guest_cr3 = vcpu->arch.cr3; - else /* vmcs01.GUEST_CR3 is already up-to-date. */ + else /* vmcs.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { @@ -3122,6 +3156,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } + static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* @@ -3686,6 +3721,19 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } +static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) +{ + /* + * When KVM is a nested hypervisor on top of Hyper-V and uses + * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR + * bitmap has changed. + */ + if (static_branch_unlikely(&enable_evmcs)) + evmcs_touch_msr_bitmap(); + + vmx->nested.force_msr_bitmap_recalc = true; +} + void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3694,8 +3742,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3739,8 +3786,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3878,12 +3924,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) { #ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - if (vcpu->mode == IN_GUEST_MODE) { /* * The vector of interrupt to be delivered to vcpu had @@ -3911,10 +3955,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, */ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; + return; } #endif - return false; + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -3930,9 +3979,21 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * This pairs with the smp_mb_*() after setting vcpu->mode in + * vcpu_enter_guest() to guarantee the vCPU sees the event + * request if triggering a posted interrupt "fails" because + * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as + * the smb_wmb() in kvm_make_request() only ensures everything + * done before making the request is visible when the request + * is visible, it doesn't ensure ordering between the store to + * vcpu->requests and the load from vcpu->mode. + */ + smp_mb__after_atomic(); + /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0; } return -1; @@ -3963,13 +4024,31 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; - if (vcpu != kvm_get_running_vcpu() && - !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); - + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); return 0; } +static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + + if (vmx_deliver_posted_interrupt(vcpu, vector)) { + kvm_lapic_set_irr(vector, apic); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); + } +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -4021,6 +4100,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + + /* + * SYSENTER is used for 32-bit system calls on either 32-bit or + * 64-bit kernels. It is always zero If neither is allowed, otherwise + * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may + * have already done so!). + */ + if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) + vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); + rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ @@ -4039,8 +4128,10 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & ~vcpu->arch.cr4_guest_rsvd_bits; - if (!enable_ept) - vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + if (!enable_ept) { + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; + } if (is_guest_mode(&vmx->vcpu)) vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask; @@ -4692,7 +4783,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } return 1; } @@ -4748,6 +4839,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ + /* + * Queue the exception here instead of in handle_nm_fault_irqoff(). + * This ensures the nested_vmx check is not skipped so vmexit can + * be reflected to L1 (when it intercepts #NM) before reaching this + * point. + */ + if (is_nm_fault(intr_info)) { + kvm_queue_exception(vcpu, NM_VECTOR); + return 1; + } + if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); @@ -4811,8 +4913,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + /* + * If the #DB was due to ICEBP, a.k.a. INT1, skip the + * instruction. ICEBP generates a trap-like #DB, but + * despite its interception control being tied to #DB, + * is an instruction intercept, i.e. the VM-Exit occurs + * on the ICEBP itself. Note, skipping ICEBP also + * clears STI and MOVSS blocking. + * + * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS + * if single-step is enabled in RFLAGS and STI or MOVSS + * blocking is active, as the CPU doesn't set the bit + * on VM-Exit due to #DB interception. VM-Entry has a + * consistency check that a single-step #DB is pending + * in this scenario as the previous instruction cannot + * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV + * don't modify RFLAGS), therefore the one instruction + * delay when activating single-step breakpoints must + * have already expired. Note, the CPU sets/clears BS + * as appropriate for all other VM-Exits types. + */ if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); + else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; @@ -5307,7 +5434,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; - if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) return 1; /* @@ -5336,6 +5463,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending; +} + static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5355,15 +5490,14 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) { + if (vmx_emulation_required_with_pending_exception(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } /* @@ -5378,6 +5512,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } +static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + return 1; +} + static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5881,18 +6025,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vmx_flush_pml_buffer(vcpu); /* - * We should never reach this point with a pending nested VM-Enter, and - * more specifically emulation of L2 due to invalid guest state (see - * below) should never happen as that means we incorrectly allowed a - * nested VM-Enter with an invalid vmcs12. + * KVM should never reach this point with a pending nested VM-Enter. + * More specifically, short-circuiting VM-Entry to emulate L2 due to + * invalid guest state should never happen as that means KVM knowingly + * allowed a nested VM-Enter with an invalid vmcs12. More below. */ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) return -EIO; - /* If guest state is invalid, start emulating */ - if (vmx->emulation_required) - return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu)) { /* * PML is never enabled when running L2, bail immediately if a @@ -5914,10 +6054,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) */ nested_mark_vmcs12_pages_dirty(vcpu); + /* + * Synthesize a triple fault if L2 state is invalid. In normal + * operation, nested VM-Enter rejects any attempt to enter L2 + * with invalid state. However, those checks are skipped if + * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If + * L2 state is invalid, it means either L1 modified SMRAM state + * or userspace provided bad state. Synthesize TRIPLE_FAULT as + * doing so is architecturally allowed in the RSM case, and is + * the least awful solution for the userspace case without + * risking false positives. + */ + if (vmx->emulation_required) { + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); + return 1; + } + if (nested_vmx_reflect_vmexit(vcpu)) return 1; } + /* If guest state is invalid, start emulating. L2 is handled above. */ + if (vmx->emulation_required) + return handle_invalid_guest_state(vcpu); + if (exit_reason.failed_vmentry) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -6262,9 +6422,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; - bool max_irr_updated; + bool got_posted_interrupt; - if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; if (pi_test_on(&vmx->pi_desc)) { @@ -6274,22 +6434,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - max_irr_updated = + got_posted_interrupt = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); - - /* - * If we are running L2 and L1 has a new pending interrupt - * which can be injected, this may cause a vmexit or it may - * be injected into L2. Either way, this interrupt will be - * processed via KVM_REQ_EVENT, not RVI, because we do not use - * virtual interrupt delivery to inject L1 interrupts into L2. - */ - if (is_guest_mode(vcpu) && max_irr_updated) - kvm_make_request(KVM_REQ_EVENT, vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); + got_posted_interrupt = false; } - vmx_hwapic_irr_update(vcpu, max_irr); + + /* + * Newly recognized interrupts are injected via either virtual interrupt + * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is + * disabled in two cases: + * + * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 + * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a + * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected + * into L2, but KVM doesn't use virtual interrupt delivery to inject + * interrupts into L2, and so KVM_REQ_EVENT is again needed. + * + * 2) If APICv is disabled for this vCPU, assigned devices may still + * attempt to post interrupts. The posted interrupt vector will cause + * a VM-Exit and the subsequent entry will call sync_pir_to_irr. + */ + if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) + vmx_set_rvi(max_irr); + else if (got_posted_interrupt) + kvm_make_request(KVM_REQ_EVENT, vcpu); + return max_irr; } @@ -6317,11 +6488,33 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry); static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, unsigned long entry) { - kvm_before_interrupt(vcpu); + bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; + + kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); vmx_do_interrupt_nmi_irqoff(entry); kvm_after_interrupt(vcpu); } +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupt is enabled, so the + * MSR value is not clobbered by the host activity before the guest + * has chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing exception is done in vmx_handle_exit. See comment there. + */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; @@ -6330,6 +6523,9 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + /* if exit due to NM, handle before interrupts are enabled */ + else if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(&vmx->vcpu); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); @@ -6566,7 +6762,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6582,7 +6778,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = native_read_cr2(); - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -6601,9 +6797,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * consistency check VM-Exit due to invalid guest state and bail. */ if (unlikely(vmx->emulation_required)) { - - /* We don't emulate invalid state of a nested guest */ - vmx->fail = is_guest_mode(vcpu); + vmx->fail = 0; vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->exit_reason.failed_vmentry = 1; @@ -6631,7 +6825,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + vcpu->arch.regs_dirty = 0; + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time + * it switches back to the current->mm, which can occur in KVM context + * when switching to a temporary mm to patch kernel code, e.g. if KVM + * toggles a static key while handling a VM-Exit. + */ cr3 = __get_current_cr3_fast(); if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); @@ -6725,7 +6927,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vmx_register_cache_reset(vcpu); + vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; pt_guest_exit(vmx); @@ -6793,6 +6995,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); + INIT_LIST_HEAD(&vmx->pi_wakeup_list); + err = -ENOMEM; vmx->vpid = allocate_vpid(); @@ -6826,6 +7030,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (err < 0) goto free_pml; + /* + * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a + * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the + * feature only for vmcs01, KVM currently isn't equipped to realize any + * performance benefits from enabling it for vmcs02. + */ + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && + (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + evmcs->hv_enlightenments_control.msr_bitmap = 1; + } + /* The MSR bitmap starts with all ones */ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -6931,7 +7148,6 @@ static int __init vmx_check_processor_compat(void) static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; - u64 ipat = 0; /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs. @@ -6951,30 +7167,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * EPT memory type is used to emulate guest CD/MTRR. */ - if (is_mmio) { - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } + if (is_mmio) + return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { - ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_WRBACK; - goto exit; - } + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + } -exit: - return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; + return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) @@ -7166,6 +7374,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } } + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + + set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); @@ -7405,25 +7618,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - if (kvm_x86_ops.set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7471,6 +7665,7 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (ret) return ret; + vmx->nested.nested_run_pending = 1; vmx->nested.smm.guest_mode = false; } return 0; @@ -7509,6 +7704,7 @@ static void hardware_unsetup(void) static bool vmx_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ); @@ -7558,12 +7754,14 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .get_if_flag = vmx_get_if_flag, .tlb_flush_all = vmx_flush_tlb_all, .tlb_flush_current = vmx_flush_tlb_current, .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, + .vcpu_pre_run = vmx_vcpu_pre_run, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, @@ -7592,7 +7790,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .deliver_interrupt = vmx_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, @@ -7622,9 +7820,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, @@ -7653,6 +7848,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; +static unsigned int vmx_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* '0' on failure so that the !PT case can use a RET0 static call. */ + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; +} + static __init void vmx_setup_user_return_msrs(void) { @@ -7679,11 +7888,13 @@ static __init void vmx_setup_user_return_msrs(void) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } +static struct kvm_x86_init_ops vmx_init_ops __initdata; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, ept_lpage_level; + int r; store_idt(&dt); host_idt_base = dt.address; @@ -7761,10 +7972,10 @@ static __init int hardware_setup(void) ple_window_shrink = 0; } - if (!cpu_has_vmx_apicv()) { + if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; @@ -7780,16 +7991,8 @@ static __init int hardware_setup(void) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); - if (!enable_ept) - ept_lpage_level = 0; - else if (cpu_has_vmx_ept_1g_page()) - ept_lpage_level = PG_LEVEL_1G; - else if (cpu_has_vmx_ept_2m_page()) - ept_lpage_level = PG_LEVEL_2M; - else - ept_lpage_level = PG_LEVEL_4K; kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), - ept_lpage_level); + ept_caps_to_lpage_level(vmx_capability.ept)); /* * Only enable PML when hardware supports PML feature, and both EPT @@ -7837,6 +8040,10 @@ static __init int hardware_setup(void) return -EINVAL; if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + if (pt_mode == PT_MODE_HOST_GUEST) + vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + else + vmx_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -7865,6 +8072,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, + .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, }; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a4ead6023133..9c6bfcd84008 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -142,6 +142,16 @@ struct nested_vmx { struct vmcs12 *cached_shadow_vmcs12; /* + * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer + */ + struct gfn_to_hva_cache shadow_vmcs12_cache; + + /* + * GPA to HVA cache for VMCS12 + */ + struct gfn_to_hva_cache vmcs12_cache; + + /* * Indicates if the shadow vmcs or enlightened vmcs must be updated * with the data held by struct vmcs12. */ @@ -149,6 +159,15 @@ struct nested_vmx { bool dirty_vmcs12; /* + * Indicates whether MSR bitmap for L2 needs to be rebuilt due to + * changes in MSR bitmap for L1 or switching to a different L2. Note, + * this flag can only be used reliably in conjunction with a paravirt L1 + * which informs L0 whether any changes to MSR bitmap for L2 were done + * on its side. + */ + bool force_msr_bitmap_recalc; + + /* * Indicates lazily loaded guest state has not yet been decached from * vmcs02. */ @@ -298,6 +317,9 @@ struct vcpu_vmx { /* Posted interrupt descriptor */ struct pi_desc pi_desc; + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; @@ -330,7 +352,7 @@ struct vcpu_vmx { struct lbr_desc lbr_desc; /* Save desired MSR intercept (read: pass-through) state */ -#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15 struct { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -463,19 +485,21 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) -static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR0) - | (1 << VCPU_EXREG_CR3) - | (1 << VCPU_EXREG_CR4) - | (1 << VCPU_EXREG_EXIT_INFO_1) - | (1 << VCPU_EXREG_EXIT_INFO_2)); - vcpu->arch.regs_dirty = 0; -} +/* + * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the + * cache on demand. Other registers not listed here are synced to + * the cache immediately after VM-Exit. + */ +#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \ + (1 << VCPU_REGS_RSP) | \ + (1 << VCPU_EXREG_RFLAGS) | \ + (1 << VCPU_EXREG_PDPTR) | \ + (1 << VCPU_EXREG_SEGMENTS) | \ + (1 << VCPU_EXREG_CR0) | \ + (1 << VCPU_EXREG_CR3) | \ + (1 << VCPU_EXREG_CR4) | \ + (1 << VCPU_EXREG_EXIT_INFO_1) | \ + (1 << VCPU_EXREG_EXIT_INFO_2)) static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 9e9ef47e988c..5e7f41225780 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -71,6 +71,31 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + + asm_volatile_goto("1: vmread %[field], %[output]\n\t" + "jna %l[do_fail]\n\t" + + _ASM_EXTABLE(1b, %l[do_exception]) + + : [output] "=r" (value) + : [field] "r" (field) + : "cc" + : do_fail, do_exception); + + return value; + +do_fail: + WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field); + pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field); + return 0; + +do_exception: + kvm_spurious_fault(); + return 0; + +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + asm volatile("1: vmread %2, %1\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -80,9 +105,11 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) * @field, and bounce through the trampoline to preserve * volatile registers. */ - "push $0\n\t" + "xorl %k1, %k1\n\t" + "2:\n\t" + "push %1\n\t" "push %2\n\t" - "2:call vmread_error_trampoline\n\t" + "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the @@ -93,14 +120,12 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - ".pushsection .fixup, \"ax\"\n\t" - "4: push $1\n\t" - "push %2\n\t" - "jmp 2b\n\t" - ".popsection\n\t" - _ASM_EXTABLE(1b, 4b) - : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc"); + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + + : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); return value; + +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ } static __always_inline u16 vmcs_read16(unsigned long field) |