diff options
Diffstat (limited to 'arch/x86/kvm/vmx')
| -rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/nested.c | 471 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 8 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 12 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 91 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 13 |
6 files changed, 321 insertions, 276 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 854e144131c6..d6664ee3d127 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -2,6 +2,8 @@ #ifndef __KVM_X86_VMX_CAPS_H #define __KVM_X86_VMX_CAPS_H +#include <asm/vmx.h> + #include "lapic.h" extern bool __read_mostly enable_vpid; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 153e539c29c9..f1a69117ac0f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -193,10 +193,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) if (!vmx->nested.hv_evmcs) return; - kunmap(vmx->nested.hv_evmcs_page); - kvm_release_page_dirty(vmx->nested.hv_evmcs_page); + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); vmx->nested.hv_evmcs_vmptr = -1ull; - vmx->nested.hv_evmcs_page = NULL; vmx->nested.hv_evmcs = NULL; } @@ -229,16 +227,9 @@ static void free_nested(struct kvm_vcpu *vcpu) kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); @@ -500,6 +491,17 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, } } +static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) { + int msr; + + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + + msr_bitmap[word] = ~0; + msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + } +} + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. @@ -508,88 +510,95 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { int msr; - struct page *page; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - /* - * pred_cmd & spec_ctrl are trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. - */ - bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); - bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return false; - if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !pred_cmd && !spec_ctrl) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) return false; - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); - if (is_error_page(page)) - return false; + msr_bitmap_l1 = (unsigned long *)map->hva; - msr_bitmap_l1 = (unsigned long *)kmap(page); - if (nested_cpu_has_apic_reg_virt(vmcs12)) { - /* - * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it - * just lets the processor take the value from the virtual-APIC page; - * take those 256 bits directly from the L1 bitmap. - */ - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap_l0[word] = msr_bitmap_l1[word]; - msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; - } - } else { - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap_l0[word] = ~0; - msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; - } - } + /* + * To keep the control flow simple, pay eight 8-byte writes (sixteen + * 4-byte writes on 32-bit systems) up front to enable intercepts for + * the x2APIC MSR range and selectively disable them below. + */ + enable_x2apic_msr_intercepts(msr_bitmap_l0); + + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + if (nested_cpu_has_apic_reg_virt(vmcs12)) { + /* + * L0 need not intercept reads for MSRs between 0x800 + * and 0x8ff, it just lets the processor take the value + * from the virtual-APIC page; take those 256 bits + * directly from the L1 bitmap. + */ + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_TASKPRI), - MSR_TYPE_W); + msr_bitmap_l0[word] = msr_bitmap_l1[word]; + } + } - if (nested_cpu_has_vid(vmcs12)) { nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_EOI), - MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_SELF_IPI), - MSR_TYPE_W); + X2APIC_MSR(APIC_TASKPRI), + MSR_TYPE_R | MSR_TYPE_W); + + if (nested_cpu_has_vid(vmcs12)) { + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_EOI), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_SELF_IPI), + MSR_TYPE_W); + } } - if (spec_ctrl) + /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_FS_BASE, MSR_TYPE_RW); + + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_GS_BASE, MSR_TYPE_RW); + + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + + /* + * Checking the L0->L1 bitmap is trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap + * from the L12 MSR bitmap that is too permissive. + * 2. That L1 or L2s have actually used the MSR. This avoids + * unnecessarily merging of the bitmap if the MSR is unused. This + * works properly because we only update the L01 MSR bitmap lazily. + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only + * updated to reflect this when L1 (or its L2s) actually write to + * the MSR. + */ + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL)) nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_SPEC_CTRL, MSR_TYPE_R | MSR_TYPE_W); - if (pred_cmd) + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD)) nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false); return true; } @@ -597,20 +606,20 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct kvm_host_map map; struct vmcs12 *shadow; - struct page *page; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == -1ull) return; shadow = get_shadow_vmcs12(vcpu); - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - memcpy(shadow, kmap(page), VMCS12_SIZE); + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + return; - kunmap(page); - kvm_release_page_clean(page); + memcpy(shadow, map.hva, VMCS12_SIZE); + kvm_vcpu_unmap(vcpu, &map, false); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, @@ -914,7 +923,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { if (!nested_cr3_valid(vcpu, cr3)) { *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; + return -EINVAL; } /* @@ -925,7 +934,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne !nested_ept) { if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { *entry_failure_code = ENTRY_FAIL_PDPTE; - return 1; + return -EINVAL; } } } @@ -1778,13 +1787,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, nested_release_evmcs(vcpu); - vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( - vcpu, assist_page.current_nested_vmcs); - - if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs), + &vmx->nested.hv_evmcs_map)) return 0; - vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); + vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; /* * Currently, KVM only supports eVMCS version 1 @@ -2357,19 +2364,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, */ if (vmx->emulation_required) { *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; + return -EINVAL; } /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) - return 1; + return -EINVAL; if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + kvm_rsp_write(vcpu, vmcs12->guest_rsp); + kvm_rip_write(vcpu, vmcs12->guest_rip); return 0; } @@ -2573,11 +2580,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, return 0; } -/* - * Checks related to Host Control Registers and MSRs - */ -static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_check_vm_execution_controls(vcpu, vmcs12) || + nested_check_vm_exit_controls(vcpu, vmcs12) || + nested_check_vm_entry_controls(vcpu, vmcs12)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { bool ia32e; @@ -2590,6 +2605,10 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) return -EINVAL; + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && + !kvm_pat_valid(vmcs12->host_ia32_pat)) + return -EINVAL; + /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, @@ -2608,41 +2627,12 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, return 0; } -/* - * Checks related to Guest Non-register State - */ -static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) -{ - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_check_vm_execution_controls(vcpu, vmcs12) || - nested_check_vm_exit_controls(vcpu, vmcs12) || - nested_check_vm_entry_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_check_host_control_regs(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - - if (nested_check_guest_non_reg_state(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - return 0; -} - static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int r; - struct page *page; + int r = 0; struct vmcs12 *shadow; + struct kvm_host_map map; if (vmcs12->vmcs_link_pointer == -1ull) return 0; @@ -2650,23 +2640,34 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) return -EINVAL; - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - if (is_error_page(page)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) return -EINVAL; - r = 0; - shadow = kmap(page); + shadow = map.hva; + if (shadow->hdr.revision_id != VMCS12_REVISION || shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) r = -EINVAL; - kunmap(page); - kvm_release_page_clean(page); + + kvm_vcpu_unmap(vcpu, &map, false); return r; } -static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 *exit_qual) +/* + * Checks related to Guest Non-register State + */ +static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) +{ + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + u32 *exit_qual) { bool ia32e; @@ -2674,11 +2675,15 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) - return 1; + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && + !kvm_pat_valid(vmcs12->guest_ia32_pat)) + return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; - return 1; + return -EINVAL; } /* @@ -2697,13 +2702,16 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || ((vmcs12->guest_cr0 & X86_CR0_PG) && ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) - return 1; + return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) - return 1; + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return -EINVAL; + + if (nested_check_guest_non_reg_state(vmcs12)) + return -EINVAL; return 0; } @@ -2816,6 +2824,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_host_map *map; struct page *page; u64 hpa; @@ -2848,53 +2857,43 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); + map = &vmx->nested.virtual_apic_map; /* * If translation failed, VM entry will fail because * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. - * Failing the vm entry is _not_ what the processor - * does but it's basically the only possibility we - * have. We could still enter the guest if CR8 load - * exits are enabled, CR8 store exits are enabled, and - * virtualize APIC access is disabled; in this case - * the processor would never use the TPR shadow and we - * could simply clear the bit from the execution - * control. But such a configuration is useless, so - * let's keep the code simple. */ - if (!is_error_page(page)) { - vmx->nested.virtual_apic_page = page; - hpa = page_to_phys(vmx->nested.virtual_apic_page); - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); + } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && + nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* + * The processor will never use the TPR shadow, simply + * clear the bit from the execution control. Such a + * configuration is useless, but it happens in tests. + * For any other configuration, failing the vm entry is + * _not_ what the processor does but it's basically the + * only possibility we have. + */ + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_TPR_SHADOW); + } else { + printk("bad virtual-APIC page address\n"); + dump_vmcs(); } } if (nested_cpu_has_posted_intr(vmcs12)) { - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull); + map = &vmx->nested.pi_desc_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { + vmx->nested.pi_desc = + (struct pi_desc *)(((void *)map->hva) + + offset_in_page(vmcs12->posted_intr_desc_addr)); + vmcs_write64(POSTED_INTR_DESC_ADDR, + pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); - if (is_error_page(page)) - return; - vmx->nested.pi_desc_page = page; - vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); - vmx->nested.pi_desc = - (struct pi_desc *)((void *)vmx->nested.pi_desc + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); } if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, @@ -2977,7 +2976,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return -1; } - if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) goto vmentry_fail_vmexit; } @@ -3122,9 +3121,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12); - if (ret) - return nested_vmx_failValid(vcpu, ret); + if (nested_vmx_check_controls(vcpu, vmcs12)) + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + if (nested_vmx_check_host_state(vcpu, vmcs12)) + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); /* * We're finally done with prerequisite checking, and can start with @@ -3287,11 +3288,12 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr != 256) { - vapic_page = kmap(vmx->nested.virtual_apic_page); + vapic_page = vmx->nested.virtual_apic_map.hva; + if (!vapic_page) + return; + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page, &max_irr); - kunmap(vmx->nested.virtual_apic_page); - status = vmcs_read16(GUEST_INTR_STATUS); if ((u8)max_irr > ((u8)status & 0xff)) { status &= ~0xff; @@ -3402,8 +3404,8 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); + vmcs12->guest_rsp = kvm_rsp_read(vcpu); + vmcs12->guest_rip = kvm_rip_read(vcpu); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); @@ -3586,8 +3588,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); vmx_set_efer(vcpu, vcpu->arch.efer); - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); + kvm_rsp_write(vcpu, vmcs12->host_rsp); + kvm_rip_write(vcpu, vmcs12->host_rip); vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); vmx_set_interrupt_shadow(vcpu, 0); @@ -3773,8 +3775,18 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); nested_ept_uninit_mmu_context(vcpu); - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + /* + * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3 + * points to shadow pages! Fortunately we only get here after a WARN_ON + * if EPT is disabled, so a VMabort is perfectly fine. + */ + if (enable_ept) { + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } else { + nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED); + } /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -3922,16 +3934,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; /* * We are now running in L2, mmu_notifier will force to reload the @@ -4227,7 +4232,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; - struct page *page; + uint32_t revision; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -4273,20 +4278,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; * which replaces physical address width with 32 */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failInvalid(vcpu); - - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failInvalid(vcpu); - if (*(u32 *)kmap(page) != VMCS12_REVISION) { - kunmap(page); - kvm_release_page_clean(page); + if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || + revision != VMCS12_REVISION) return nested_vmx_failInvalid(vcpu); - } - kunmap(page); - kvm_release_page_clean(page); vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); @@ -4344,7 +4341,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); @@ -4352,7 +4349,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - if (vmx->nested.hv_evmcs_page) { + if (vmx->nested.hv_evmcs_map.hva) { if (vmptr == vmx->nested.hv_evmcs_vmptr) nested_release_evmcs(vcpu); } else { @@ -4551,7 +4548,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); @@ -4564,11 +4561,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; if (vmx->nested.current_vmptr != vmptr) { + struct kvm_host_map map; struct vmcs12 *new_vmcs12; - struct page *page; - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the @@ -4578,12 +4574,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } - new_vmcs12 = kmap(page); + + new_vmcs12 = map.hva; + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || (new_vmcs12->hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &map, false); return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -4595,8 +4592,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * cached. */ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &map, false); set_current_vmptr(vmx, vmptr); } @@ -4771,7 +4767,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 index = kvm_rcx_read(vcpu); u64 address; bool accessed_dirty; struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -4817,7 +4813,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; - u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; + u32 function = kvm_rax_read(vcpu); /* * VMFUNC is only supported for nested guests, but we always enable the @@ -4903,7 +4899,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 exit_reason) { - u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 msr_index = kvm_rcx_read(vcpu); gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) @@ -5340,9 +5336,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->format != 0) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) - nested_enable_evmcs(vcpu, NULL); - if (!nested_vmx_allowed(vcpu)) return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; @@ -5384,13 +5377,16 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->vmx.vmxon_pa == -1ull) return 0; + if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) + nested_enable_evmcs(vcpu, NULL); + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; ret = enter_vmx_operation(vcpu); if (ret) return ret; /* Empty 'VMXON' state is permitted */ - if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) return 0; if (kvm_state->vmx.vmcs_pa != -1ull) { @@ -5427,14 +5423,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return 0; - vmx->nested.nested_run_pending = - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); - if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); - if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12)) return -EINVAL; if (copy_from_user(shadow_vmcs12, @@ -5447,14 +5440,20 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; } - if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) || - nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_controls(vcpu, vmcs12) || + nested_vmx_check_host_state(vcpu, vmcs12) || + nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) return -EINVAL; vmx->nested.dirty_vmcs12 = true; + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + ret = nested_vmx_enter_non_root_mode(vcpu, false); - if (ret) + if (ret) { + vmx->nested.nested_run_pending = 0; return -EINVAL; + } return 0; } @@ -5722,6 +5721,14 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; + /* + * Without EPT it is not possible to restore L1's CR3 and PDPTR on + * VMfail, because they are not available in vmcs01. Just always + * use hardware checks. + */ + if (!enable_ept) + nested_early_check = 1; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5ab4a364348e..f8502c376b37 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -227,7 +227,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; pmu->global_ovf_ctrl = data; @@ -297,6 +297,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | + MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); + if (kvm_x86_ops->pt_supported()) + pmu->global_ovf_ctrl_mask &= + ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry(vcpu, 7, 0); if (entry && diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 7b272738c576..d4cb1945b2e3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -3,6 +3,7 @@ #include <asm/asm.h> #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> +#include <asm/nospec-branch.h> #define WORD_SIZE (BITS_PER_LONG / 8) @@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter) * referred to by VMCS.HOST_RIP. */ ENTRY(vmx_vmexit) +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE + /* Preserve guest's RAX, it's used to stuff the RSB. */ + push %_ASM_AX + + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + + pop %_ASM_AX +.Lvmexit_skip_rsb: +#endif ret ENDPROC(vmx_vmexit) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab432a930ae8..1ac167614032 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1692,6 +1692,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_POWER_CTL: + msr_info->data = vmx->msr_ia32_power_ctl; + break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1822,6 +1825,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_POWER_CTL: + vmx->msr_ia32_power_ctl = data; + break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1891,7 +1897,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + if (!kvm_pat_valid(data)) return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; @@ -2288,7 +2294,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_CLEAR_BNDCFGS | @@ -3619,14 +3624,13 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || !nested_cpu_has_vid(get_vmcs12(vcpu)) || - WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) return false; rvi = vmx_get_rvi(); - vapic_page = kmap(vmx->nested.virtual_apic_page); + vapic_page = vmx->nested.virtual_apic_map.hva; vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); - kunmap(vmx->nested.virtual_apic_page); return ((rvi & 0xf0) > (vppr & 0xf0)); } @@ -4827,7 +4831,7 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 ecx = kvm_rcx_read(vcpu); struct msr_data msr_info; msr_info.index = ecx; @@ -4840,18 +4844,16 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) trace_kvm_msr_read(ecx, msr_info.data); - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; + kvm_rax_write(vcpu, msr_info.data & -1u); + kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u); return kvm_skip_emulated_instruction(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) { struct msr_data msr; - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + u32 ecx = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); msr.data = data; msr.index = ecx; @@ -4922,7 +4924,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) static int handle_xsetbv(struct kvm_vcpu *vcpu) { u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + u32 index = kvm_rcx_read(vcpu); if (kvm_set_xcr(vcpu, index, new_bv) == 0) return kvm_skip_emulated_instruction(vcpu); @@ -5603,7 +5605,7 @@ static void vmx_dump_dtsel(char *name, uint32_t limit) vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -static void dump_vmcs(void) +void dump_vmcs(void) { u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); @@ -5723,8 +5725,16 @@ static void dump_vmcs(void) if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) pr_err("TSC Multiplier = 0x%016llx\n", vmcs_read64(TSC_MULTIPLIER)); - if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + u16 status = vmcs_read16(GUEST_INTR_STATUS); + pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); + } + pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); + pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); + } if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) @@ -6410,6 +6420,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + kvm_load_guest_xcr0(vcpu); + if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && vcpu->arch.pkru != vmx->host_pkru) @@ -6429,8 +6441,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); + /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); if (vcpu->arch.cr2 != read_cr2()) write_cr2(vcpu->arch.cr2); @@ -6460,9 +6475,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) current_evmcs->hv_clean_fields |= @@ -6501,11 +6513,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = __read_pkru(); + vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); } + kvm_put_guest_xcr0(vcpu); + vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -6667,8 +6681,8 @@ free_partial_vcpu: return ERR_PTR(err); } -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" static int vmx_vm_init(struct kvm *kvm) { @@ -6999,10 +7013,12 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, return 0; } -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; if (kvm_mwait_in_guest(vcpu->kvm)) return -EOPNOTSUPP; @@ -7011,7 +7027,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; - lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); + lapic_timer_advance_cycles = nsec_to_cycles(vcpu, + ktimer->timer_advance_ns); if (delta_tsc > lapic_timer_advance_cycles) delta_tsc -= lapic_timer_advance_cycles; @@ -7020,10 +7037,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && - u64_shl_div_u64(delta_tsc, + delta_tsc && u64_shl_div_u64(delta_tsc, kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, - &delta_tsc)) + vcpu->arch.tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* @@ -7036,7 +7052,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; - return delta_tsc == 0; + *expired = !delta_tsc; + return 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) @@ -7073,9 +7090,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t gpa; - struct page *page = NULL; - u64 *pml_address; + gpa_t gpa, dst; if (is_guest_mode(vcpu)) { WARN_ON_ONCE(vmx->nested.pml_full); @@ -7095,15 +7110,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); - if (is_error_page(page)) + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, + offset_in_page(dst), sizeof(gpa))) return 0; - pml_address = kmap(page); - pml_address[vmcs12->guest_pml_index--] = gpa; - kunmap(page); - kvm_release_page_clean(page); + vmcs12->guest_pml_index--; } return 0; @@ -7369,7 +7382,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -7380,9 +7393,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) } if (vmx->nested.smm.guest_mode) { - vcpu->arch.hflags &= ~HF_SMM_MASK; ret = nested_vmx_enter_non_root_mode(vcpu, false); - vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a1e00d0a2482..63d37ccce3dc 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -142,8 +142,11 @@ struct nested_vmx { * pointers, so we must keep them pinned while L2 runs. */ struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; + struct kvm_host_map virtual_apic_map; + struct kvm_host_map pi_desc_map; + + struct kvm_host_map msr_bitmap_map; + struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; @@ -169,7 +172,7 @@ struct nested_vmx { } smm; gpa_t hv_evmcs_vmptr; - struct page *hv_evmcs_page; + struct kvm_host_map hv_evmcs_map; struct hv_enlightened_vmcs *hv_evmcs; }; @@ -257,6 +260,8 @@ struct vcpu_vmx { unsigned long host_debugctlmsr; + u64 msr_ia32_power_ctl; + /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included @@ -517,4 +522,6 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } +void dump_vmcs(void); + #endif /* __KVM_X86_VMX_H */ |