diff options
Diffstat (limited to 'arch/x86/kvm/svm')
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 163 | ||||
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 297 | ||||
-rw-r--r-- | arch/x86/kvm/svm/pmu.c | 25 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 287 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 840 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 111 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm_onhyperv.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/svm/vmenter.S | 4 |
8 files changed, 1038 insertions, 701 deletions
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index affc0ea98d30..fb3e20791338 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -27,20 +27,6 @@ #include "irq.h" #include "svm.h" -#define SVM_AVIC_DOORBELL 0xc001011b - -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - -/* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. - */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 255 - -#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 -#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 -#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF - /* AVIC GATAG is encoded using VM and VCPU IDs */ #define AVIC_VCPU_ID_BITS 8 #define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) @@ -73,12 +59,6 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; -enum avic_ipi_failure_cause { - AVIC_IPI_FAILURE_INVALID_INT_TYPE, - AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, - AVIC_IPI_FAILURE_INVALID_TARGET, - AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, -}; /* Note: * This function is called from IOMMU driver to notify @@ -289,20 +269,44 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +void avic_ring_doorbell(struct kvm_vcpu *vcpu) +{ + /* + * Note, the vCPU could get migrated to a different pCPU at any point, + * which could result in signalling the wrong/previous pCPU. But if + * that happens the vCPU is guaranteed to do a VMRUN (after being + * migrated) and thus will process pending interrupts, i.e. a doorbell + * is not needed (and the spurious one is harmless). + */ + int cpu = READ_ONCE(vcpu->cpu); + + if (cpu != get_cpu()) + wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + put_cpu(); +} + static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; + /* + * Wake any target vCPUs that are blocking, i.e. waiting for a wake + * event. There's no need to signal doorbells, as hardware has handled + * vCPUs that were in guest at the time of the IPI, and vCPUs that have + * since entered the guest will have processed pending IRQs at VMRUN. + */ kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, source, - icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK); - - if (m && !avic_vcpu_is_running(vcpu)) - kvm_vcpu_wake_up(vcpu); + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & APIC_DEST_MASK)) { + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); + } } } @@ -342,8 +346,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh); break; case AVIC_IPI_FAILURE_INVALID_TARGET: - WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", - index, vcpu->vcpu_id, icrh, icrl); break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); @@ -666,26 +668,6 @@ void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) -{ - if (!vcpu->arch.apicv_active) - return -1; - - kvm_lapic_set_irr(vec, vcpu->arch.apic); - smp_mb__after_atomic(); - - if (avic_vcpu_is_running(vcpu)) { - int cpuid = vcpu->cpu; - - if (cpuid != get_cpu()) - wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); - put_cpu(); - } else - kvm_vcpu_wake_up(vcpu); - - return 0; -} - bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) { return false; @@ -900,6 +882,7 @@ out: bool svm_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | @@ -948,6 +931,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -955,19 +940,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; + /* + * No need to update anything if the vCPU is blocking, i.e. if the vCPU + * is being scheduled in after being preempted. The CPU entries in the + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. + * If the vCPU was migrated, its new CPU value will be stuffed when the + * vCPU unblocks. + */ + if (kvm_vcpu_is_blocking(vcpu)) + return; + entry = READ_ONCE(*(svm->avic_physical_id_cache)); WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (svm->avic_is_running) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, - svm->avic_is_running); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -975,40 +966,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + + /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + return; + + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -/* - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +void avic_vcpu_blocking(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - svm->avic_is_running = is_run; - if (!kvm_vcpu_apicv_active(vcpu)) return; - if (is_run) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); -} + preempt_disable(); -void svm_vcpu_blocking(struct kvm_vcpu *vcpu) -{ - avic_set_running(vcpu, false); + /* + * Unload the AVIC when the vCPU is about to block, _before_ + * the vCPU actually blocks. + * + * Any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source, therefore vIRR will also + * be checked by kvm_vcpu_check_block() before blocking. The + * memory barrier implicit in set_current_state orders writing + * IsRunning=0 before reading the vIRR. The processor needs a + * matching memory barrier on interrupt delivery between writing + * IRR and reading IsRunning; the lack of this barrier might be + * the cause of errata #1235). + */ + avic_vcpu_put(vcpu); + + preempt_enable(); } -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) { - if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) - kvm_vcpu_update_apicv(vcpu); - avic_set_running(vcpu, true); + int cpu; + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + cpu = get_cpu(); + WARN_ON(cpu != vcpu->cpu); + + avic_vcpu_load(vcpu, cpu); + + put_cpu(); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f8b7bc04b3e7..39d280e7e80e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -58,8 +58,9 @@ static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_excep struct vcpu_svm *svm = to_svm(vcpu); WARN_ON(!is_guest_mode(vcpu)); - if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && - !svm->nested.nested_run_pending) { + if (vmcb12_is_intercept(&svm->nested.ctl, + INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && + !svm->nested.nested_run_pending) { svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = fault->error_code; @@ -121,7 +122,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { - struct vmcb_control_area *c, *h, *g; + struct vmcb_control_area *c, *h; + struct vmcb_ctrl_area_cached *g; unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -163,37 +165,6 @@ void recalc_intercepts(struct vcpu_svm *svm) vmcb_set_intercept(c, INTERCEPT_VMSAVE); } -static void copy_vmcb_control_area(struct vmcb_control_area *dst, - struct vmcb_control_area *from) -{ - unsigned int i; - - for (i = 0; i < MAX_INTERCEPT; i++) - dst->intercepts[i] = from->intercepts[i]; - - dst->iopm_base_pa = from->iopm_base_pa; - dst->msrpm_base_pa = from->msrpm_base_pa; - dst->tsc_offset = from->tsc_offset; - /* asid not copied, it is handled manually for svm->vmcb. */ - dst->tlb_ctl = from->tlb_ctl; - dst->int_ctl = from->int_ctl; - dst->int_vector = from->int_vector; - dst->int_state = from->int_state; - dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; - dst->exit_info_1 = from->exit_info_1; - dst->exit_info_2 = from->exit_info_2; - dst->exit_int_info = from->exit_int_info; - dst->exit_int_info_err = from->exit_int_info_err; - dst->nested_ctl = from->nested_ctl; - dst->event_inj = from->event_inj; - dst->event_inj_err = from->event_inj_err; - dst->nested_cr3 = from->nested_cr3; - dst->virt_ext = from->virt_ext; - dst->pause_filter_count = from->pause_filter_count; - dst->pause_filter_thresh = from->pause_filter_thresh; -} - static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { /* @@ -203,7 +174,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -250,10 +221,10 @@ static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) } } -static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, - struct vmcb_control_area *control) +static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *control) { - if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) + if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN))) return false; if (CC(control->asid == 0)) @@ -275,9 +246,20 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, return true; } -static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +/* Common checks that apply to both L1 and L2 state. */ +static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, + struct vmcb_save_area_cached *save) { + if (CC(!(save->efer & EFER_SVME))) + return false; + + if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || + CC(save->cr0 & ~0xffffffffULL)) + return false; + + if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) + return false; + /* * These checks are also performed by KVM_SET_SREGS, * except that EFER.LMA is not checked by SVM against @@ -293,48 +275,90 @@ static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) return false; + if (CC(!kvm_valid_efer(vcpu, save->efer))) + return false; + return true; } -/* Common checks that apply to both L1 and L2 state. */ -static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) { - /* - * FIXME: these should be done after copying the fields, - * to avoid TOC/TOU races. For these save area checks - * the possible damage is limited since kvm_set_cr0 and - * kvm_set_cr4 handle failure; EFER_SVME is an exception - * so it is force-set later in nested_prepare_vmcb_save. - */ - if (CC(!(save->efer & EFER_SVME))) - return false; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area_cached *save = &svm->nested.save; - if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || - CC(save->cr0 & ~0xffffffffULL)) - return false; + return __nested_vmcb_check_save(vcpu, save); +} - if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) - return false; +static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl; - if (!nested_vmcb_check_cr3_cr4(vcpu, save)) - return false; + return __nested_vmcb_check_controls(vcpu, ctl); +} - if (CC(!kvm_valid_efer(vcpu, save->efer))) - return false; +static +void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, + struct vmcb_control_area *from) +{ + unsigned int i; - return true; + for (i = 0; i < MAX_INTERCEPT; i++) + to->intercepts[i] = from->intercepts[i]; + + to->iopm_base_pa = from->iopm_base_pa; + to->msrpm_base_pa = from->msrpm_base_pa; + to->tsc_offset = from->tsc_offset; + to->tlb_ctl = from->tlb_ctl; + to->int_ctl = from->int_ctl; + to->int_vector = from->int_vector; + to->int_state = from->int_state; + to->exit_code = from->exit_code; + to->exit_code_hi = from->exit_code_hi; + to->exit_info_1 = from->exit_info_1; + to->exit_info_2 = from->exit_info_2; + to->exit_int_info = from->exit_int_info; + to->exit_int_info_err = from->exit_int_info_err; + to->nested_ctl = from->nested_ctl; + to->event_inj = from->event_inj; + to->event_inj_err = from->event_inj_err; + to->nested_cr3 = from->nested_cr3; + to->virt_ext = from->virt_ext; + to->pause_filter_count = from->pause_filter_count; + to->pause_filter_thresh = from->pause_filter_thresh; + + /* Copy asid here because nested_vmcb_check_controls will check it. */ + to->asid = from->asid; + to->msrpm_base_pa &= ~0x0fffULL; + to->iopm_base_pa &= ~0x0fffULL; } -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control) +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control) { - copy_vmcb_control_area(&svm->nested.ctl, control); + __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control); +} - /* Copy it here because nested_svm_check_controls will check it. */ - svm->nested.ctl.asid = control->asid; - svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; - svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; +static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, + struct vmcb_save_area *from) +{ + /* + * Copy only fields that are validated, as we need them + * to avoid TOC/TOU races. + */ + to->efer = from->efer; + to->cr0 = from->cr0; + to->cr3 = from->cr3; + to->cr4 = from->cr4; + + to->dr6 = from->dr6; + to->dr7 = from->dr7; +} + +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save) +{ + __nested_copy_vmcb_save_to_cache(&svm->nested.save, save); } /* @@ -437,14 +461,13 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && - CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) + CC(!load_pdptrs(vcpu, cr3))) return -EINVAL; if (!nested_npt) kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); @@ -490,15 +513,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - /* - * Force-set EFER_SVME even though it is checked earlier on the - * VMCB12, because the guest can flip the bit between the check - * and now. Clearing EFER_SVME would call svm_free_nested. - */ - svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_efer(&svm->vcpu, svm->nested.save.efer); - svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); - svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm_set_cr0(&svm->vcpu, svm->nested.save.cr0); + svm_set_cr4(&svm->vcpu, svm->nested.save.cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -513,8 +531,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { - svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; + svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; vmcb_mark_dirty(svm->vmcb, VMCB_DR); } } @@ -628,7 +646,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_vmcb02_prepare_control(svm); nested_vmcb02_prepare_save(svm, vmcb12); - ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, nested_npt_enabled(svm), from_vmrun); if (ret) return ret; @@ -678,10 +696,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || - !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { + if (!nested_vmcb_check_save(vcpu) || + !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -964,9 +983,9 @@ void svm_free_nested(struct vcpu_svm *svm) /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. */ -void svm_leave_nested(struct vcpu_svm *svm) +void svm_leave_nested(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; @@ -988,7 +1007,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -1015,7 +1034,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -1046,12 +1065,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } @@ -1069,7 +1088,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -1147,7 +1166,7 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static int svm_check_nested_events(struct kvm_vcpu *vcpu) @@ -1251,11 +1270,47 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); } +/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ +static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, + struct vmcb_ctrl_area_cached *from) +{ + unsigned int i; + + memset(dst, 0, sizeof(struct vmcb_control_area)); + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->nested_cr3 = from->nested_cr3; + dst->virt_ext = from->virt_ext; + dst->pause_filter_count = from->pause_filter_count; + dst->pause_filter_thresh = from->pause_filter_thresh; +} + static int svm_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) { struct vcpu_svm *svm; + struct vmcb_control_area *ctl; + unsigned long r; struct kvm_nested_state kvm_state = { .flags = 0, .format = KVM_STATE_NESTED_FORMAT_SVM, @@ -1297,9 +1352,18 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, */ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE)) return -EFAULT; - if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, - sizeof(user_vmcb->control))) + + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + if (!ctl) + return -ENOMEM; + + nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl); + r = copy_to_user(&user_vmcb->control, ctl, + sizeof(user_vmcb->control)); + kfree(ctl); + if (r) return -EFAULT; + if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save, sizeof(user_vmcb->save))) return -EFAULT; @@ -1316,6 +1380,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; struct vmcb_save_area *save; + struct vmcb_save_area_cached save_cached; + struct vmcb_ctrl_area_cached ctl_cached; unsigned long cr0; int ret; @@ -1345,7 +1411,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); return 0; } @@ -1368,7 +1434,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - if (!nested_vmcb_check_controls(vcpu, ctl)) + __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl); + if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; /* @@ -1383,22 +1450,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). */ + __nested_copy_vmcb_save_to_cache(&save_cached, save); if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !nested_vmcb_valid_sregs(vcpu, save)) - goto out_free; - - /* - * While the nested guest CR3 is already checked and set by - * KVM_SET_SREGS, it was set when nested state was yet loaded, - * thus MMU might not be initialized correctly. - * Set it again to fix this. - */ - - ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, - nested_npt_enabled(svm), false); - if (WARN_ON_ONCE(ret)) + !__nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; @@ -1410,7 +1466,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (is_guest_mode(vcpu)) - svm_leave_nested(svm); + svm_leave_nested(vcpu); else svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; @@ -1422,10 +1478,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save); - nested_load_control_from_vmcb12(svm, ctl); + nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); nested_vmcb02_prepare_control(svm); + + /* + * While the nested guest CR3 is already checked and set by + * KVM_SET_SREGS, it was set when nested state was yet loaded, + * thus MMU might not be initialized correctly. + * Set it again to fix this. + */ + + ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm), false); + if (WARN_ON_ONCE(ret)) + goto out_free; + + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: @@ -1449,7 +1519,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. */ - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; if (!nested_svm_vmrun_msrpm(svm)) { @@ -1464,6 +1534,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) } struct kvm_x86_nested_ops svm_nested_ops = { + .leave_nested = svm_leave_nested, .check_events = svm_check_nested_events, .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 871c426ec389..5aa45f13b16d 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -16,6 +16,7 @@ #include "cpuid.h" #include "lapic.h" #include "pmu.h" +#include "svm.h" enum pmu_type { PMU_TYPE_COUNTER = 0, @@ -100,6 +101,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + if (!enable_pmu) + return NULL; + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: @@ -134,12 +138,16 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return &pmu->gp_counters[msr_to_index(msr)]; } -static unsigned amd_find_arch_event(struct kvm_pmu *pmu, - u8 event_select, - u8 unit_mask) +static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) { + u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; + /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ + if (WARN_ON(pmc_is_fixed(pmc))) + return PERF_COUNT_HW_MAX; + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) if (amd_event_mapping[i].eventsel == event_select && amd_event_mapping[i].unit_mask == unit_mask) @@ -151,12 +159,6 @@ static unsigned amd_find_arch_event(struct kvm_pmu *pmu, return amd_event_mapping[i].event_type; } -/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ -static unsigned amd_find_fixed_event(int idx) -{ - return PERF_COUNT_HW_MAX; -} - /* check if a PMC is enabled by comparing it against global_ctrl bits. Because * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). */ @@ -281,7 +283,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; - pmu->reserved_bits = 0xffffffff00200000ull; + pmu->reserved_bits = 0xfffffff000280000ull; pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; @@ -319,8 +321,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops = { - .find_arch_event = amd_find_arch_event, - .find_fixed_event = amd_find_fixed_event, + .pmc_perf_hw_id = amd_pmc_perf_hw_id, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 902c52a8dd0c..17b53457d866 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -237,7 +237,6 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - bool es_active = argp->id == KVM_SEV_ES_INIT; int asid, ret; if (kvm->created_vcpus) @@ -247,7 +246,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (unlikely(sev->active)) return ret; - sev->es_active = es_active; + sev->active = true; + sev->es_active = argp->id == KVM_SEV_ES_INIT; asid = sev_asid_new(sev); if (asid < 0) goto e_no_asid; @@ -257,8 +257,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (ret) goto e_free; - sev->active = true; - sev->asid = asid; INIT_LIST_HEAD(&sev->regions_list); return 0; @@ -268,6 +266,7 @@ e_free: sev->asid = 0; e_no_asid: sev->es_active = false; + sev->active = false; return ret; } @@ -637,7 +636,8 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_vcpu *vcpu; - int i, ret; + unsigned long i; + int ret; if (!sev_es_guest(kvm)) return -ENOTTY; @@ -1530,7 +1530,7 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } -static bool cmd_allowed_from_miror(u32 cmd_id) +static bool is_cmd_allowed_from_mirror(u32 cmd_id) { /* * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES @@ -1544,35 +1544,57 @@ static bool cmd_allowed_from_miror(u32 cmd_id) return false; } -static int sev_lock_for_migration(struct kvm *kvm) +static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + int r = -EBUSY; + + if (dst_kvm == src_kvm) + return -EINVAL; /* - * Bail if this VM is already involved in a migration to avoid deadlock - * between two VMs trying to migrate to/from each other. + * Bail if these VMs are already involved in a migration to avoid + * deadlock between two VMs trying to migrate to/from each other. */ - if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1)) + if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) return -EBUSY; - mutex_lock(&kvm->lock); + if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) + goto release_dst; + r = -EINTR; + if (mutex_lock_killable(&dst_kvm->lock)) + goto release_src; + if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) + goto unlock_dst; return 0; + +unlock_dst: + mutex_unlock(&dst_kvm->lock); +release_src: + atomic_set_release(&src_sev->migration_in_progress, 0); +release_dst: + atomic_set_release(&dst_sev->migration_in_progress, 0); + return r; } -static void sev_unlock_after_migration(struct kvm *kvm) +static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; - mutex_unlock(&kvm->lock); - atomic_set_release(&sev->migration_in_progress, 0); + mutex_unlock(&dst_kvm->lock); + mutex_unlock(&src_kvm->lock); + atomic_set_release(&dst_sev->migration_in_progress, 0); + atomic_set_release(&src_sev->migration_in_progress, 0); } static int sev_lock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i, j; + unsigned long i, j; kvm_for_each_vcpu(i, vcpu, kvm) { if (mutex_lock_killable(&vcpu->mutex)) @@ -1594,7 +1616,7 @@ out_unlock: static void sev_unlock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { mutex_unlock(&vcpu->mutex); @@ -1608,19 +1630,20 @@ static void sev_migrate_from(struct kvm_sev_info *dst, dst->asid = src->asid; dst->handle = src->handle; dst->pages_locked = src->pages_locked; + dst->enc_context_owner = src->enc_context_owner; src->asid = 0; src->active = false; src->handle = 0; src->pages_locked = 0; + src->enc_context_owner = NULL; - INIT_LIST_HEAD(&dst->regions_list); - list_replace_init(&src->regions_list, &dst->regions_list); + list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); } static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) { - int i; + unsigned long i; struct kvm_vcpu *dst_vcpu, *src_vcpu; struct vcpu_svm *dst_svm, *src_svm; @@ -1667,15 +1690,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) bool charged = false; int ret; - ret = sev_lock_for_migration(kvm); - if (ret) - return ret; - - if (sev_guest(kvm)) { - ret = -EINVAL; - goto out_unlock; - } - source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; @@ -1683,16 +1697,26 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) } source_kvm = source_kvm_file->private_data; - ret = sev_lock_for_migration(source_kvm); + ret = sev_lock_two_vms(kvm, source_kvm); if (ret) goto out_fput; - if (!sev_guest(source_kvm)) { + if (sev_guest(kvm) || !sev_guest(source_kvm)) { ret = -EINVAL; - goto out_source; + goto out_unlock; } src_sev = &to_kvm_svm(source_kvm)->sev_info; + + /* + * VMs mirroring src's encryption context rely on it to keep the + * ASID allocated, but below we are clearing src_sev->asid. + */ + if (src_sev->num_mirrored_vms) { + ret = -EBUSY; + goto out_unlock; + } + dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) { @@ -1729,13 +1753,11 @@ out_dst_cgroup: sev_misc_cg_uncharge(cg_cleanup_sev); put_misc_cg(cg_cleanup_sev->misc_cg); cg_cleanup_sev->misc_cg = NULL; -out_source: - sev_unlock_after_migration(source_kvm); +out_unlock: + sev_unlock_two_vms(kvm, source_kvm); out_fput: if (source_kvm_file) fput(source_kvm_file); -out_unlock: - sev_unlock_after_migration(kvm); return ret; } @@ -1757,7 +1779,7 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) /* Only the enc_context_owner handles some memory enc operations. */ if (is_mirroring_enc_context(kvm) && - !cmd_allowed_from_miror(sev_cmd.id)) { + !is_cmd_allowed_from_mirror(sev_cmd.id)) { r = -EINVAL; goto out; } @@ -1954,71 +1976,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; - struct kvm_sev_info source_sev, *mirror_sev; + struct kvm_sev_info *source_sev, *mirror_sev; int ret; source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; - goto e_source_put; + goto e_source_fput; } source_kvm = source_kvm_file->private_data; - mutex_lock(&source_kvm->lock); - - if (!sev_guest(source_kvm)) { - ret = -EINVAL; - goto e_source_unlock; - } + ret = sev_lock_two_vms(kvm, source_kvm); + if (ret) + goto e_source_fput; - /* Mirrors of mirrors should work, but let's not get silly */ - if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) { + /* + * Mirrors of mirrors should work, but let's not get silly. Also + * disallow out-of-band SEV/SEV-ES init if the target is already an + * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being + * created after SEV/SEV-ES initialization, e.g. to init intercepts. + */ + if (sev_guest(kvm) || !sev_guest(source_kvm) || + is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { ret = -EINVAL; - goto e_source_unlock; + goto e_unlock; } - memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, - sizeof(source_sev)); - /* * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ + source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - - fput(source_kvm_file); - mutex_unlock(&source_kvm->lock); - mutex_lock(&kvm->lock); - - if (sev_guest(kvm)) { - ret = -EINVAL; - goto e_mirror_unlock; - } + source_sev->num_mirrored_vms++; /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; - mirror_sev->asid = source_sev.asid; - mirror_sev->fd = source_sev.fd; - mirror_sev->es_active = source_sev.es_active; - mirror_sev->handle = source_sev.handle; + mirror_sev->asid = source_sev->asid; + mirror_sev->fd = source_sev->fd; + mirror_sev->es_active = source_sev->es_active; + mirror_sev->handle = source_sev->handle; + INIT_LIST_HEAD(&mirror_sev->regions_list); + ret = 0; + /* * Do not copy ap_jump_table. Since the mirror does not share the same * KVM contexts as the original, and they may have different * memory-views. */ - mutex_unlock(&kvm->lock); - return 0; - -e_mirror_unlock: - mutex_unlock(&kvm->lock); - kvm_put_kvm(source_kvm); - return ret; -e_source_unlock: - mutex_unlock(&source_kvm->lock); -e_source_put: +e_unlock: + sev_unlock_two_vms(kvm, source_kvm); +e_source_fput: if (source_kvm_file) fput(source_kvm_file); return ret; @@ -2030,17 +2041,24 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; + WARN_ON(sev->num_mirrored_vms); + if (!sev_guest(kvm)) return; /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if (is_mirroring_enc_context(kvm)) { - kvm_put_kvm(sev->enc_context_owner); + struct kvm *owner_kvm = sev->enc_context_owner; + struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info; + + mutex_lock(&owner_kvm->lock); + if (!WARN_ON(!owner_sev->num_mirrored_vms)) + owner_sev->num_mirrored_vms--; + mutex_unlock(&owner_kvm->lock); + kvm_put_kvm(owner_kvm); return; } - mutex_lock(&kvm->lock); - /* * Ensure that all guest tagged cache entries are flushed before * releasing the pages back to the system for use. CLFLUSH will @@ -2060,8 +2078,6 @@ void sev_vm_destroy(struct kvm *kvm) } } - mutex_unlock(&kvm->lock); - sev_unbind_asid(kvm, sev->handle); sev_asid_free(sev); } @@ -2084,8 +2100,13 @@ void __init sev_hardware_setup(void) if (!sev_enabled || !npt_enabled) goto out; - /* Does the CPU support SEV? */ - if (!boot_cpu_has(X86_FEATURE_SEV)) + /* + * SEV must obviously be supported in hardware. Sanity check that the + * CPU supports decode assists, which is mandatory for SEV guests to + * support instruction emulation. + */ + if (!boot_cpu_has(X86_FEATURE_SEV) || + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS))) goto out; /* Retrieve SEV CPUID information */ @@ -2245,7 +2266,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->sev_es.vmsa)); if (svm->sev_es.ghcb_sa_free) - kfree(svm->sev_es.ghcb_sa); + kvfree(svm->sev_es.ghcb_sa); } static void dump_ghcb(struct vcpu_svm *svm) @@ -2337,24 +2358,29 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static int sev_es_validate_vmgexit(struct vcpu_svm *svm) +static bool sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu; struct ghcb *ghcb; - u64 exit_code = 0; + u64 exit_code; + u64 reason; ghcb = svm->sev_es.ghcb; - /* Only GHCB Usage code 0 is supported */ - if (ghcb->ghcb_usage) - goto vmgexit_err; - /* - * Retrieve the exit code now even though is may not be marked valid + * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. */ exit_code = ghcb_get_sw_exit_code(ghcb); + /* Only GHCB Usage code 0 is supported */ + if (ghcb->ghcb_usage) { + reason = GHCB_ERR_INVALID_USAGE; + goto vmgexit_err; + } + + reason = GHCB_ERR_MISSING_INPUT; + if (!ghcb_sw_exit_code_is_valid(ghcb) || !ghcb_sw_exit_info_1_is_valid(ghcb) || !ghcb_sw_exit_info_2_is_valid(ghcb)) @@ -2433,30 +2459,34 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_UNSUPPORTED_EVENT: break; default: + reason = GHCB_ERR_INVALID_EVENT; goto vmgexit_err; } - return 0; + return true; vmgexit_err: vcpu = &svm->vcpu; - if (ghcb->ghcb_usage) { + if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", ghcb->ghcb_usage); + } else if (reason == GHCB_ERR_INVALID_EVENT) { + vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", + exit_code); } else { - vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n", + vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", exit_code); dump_ghcb(svm); } - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + /* Clear the valid entries fields */ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, reason); - return -EINVAL; + return false; } void sev_es_unmap_ghcb(struct vcpu_svm *svm) @@ -2478,7 +2508,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) svm->sev_es.ghcb_sa_sync = false; } - kfree(svm->sev_es.ghcb_sa); + kvfree(svm->sev_es.ghcb_sa); svm->sev_es.ghcb_sa = NULL; svm->sev_es.ghcb_sa_free = false; } @@ -2526,14 +2556,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); if (!scratch_gpa_beg) { pr_err("vmgexit: scratch gpa not provided\n"); - return false; + goto e_scratch; } scratch_gpa_end = scratch_gpa_beg + len; if (scratch_gpa_end < scratch_gpa_beg) { pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", len, scratch_gpa_beg); - return false; + goto e_scratch; } if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { @@ -2551,7 +2581,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_gpa_end > ghcb_scratch_end) { pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", scratch_gpa_beg, scratch_gpa_end); - return false; + goto e_scratch; } scratch_va = (void *)svm->sev_es.ghcb; @@ -2564,18 +2594,18 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) if (len > GHCB_SCRATCH_AREA_LIMIT) { pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", len, GHCB_SCRATCH_AREA_LIMIT); - return false; + goto e_scratch; } - scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT); + scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); if (!scratch_va) - return false; + goto e_scratch; if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { /* Unable to copy scratch area from guest */ pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); - kfree(scratch_va); - return false; + kvfree(scratch_va); + goto e_scratch; } /* @@ -2592,6 +2622,12 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) svm->sev_es.ghcb_sa_len = len; return true; + +e_scratch: + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + + return false; } static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, @@ -2642,7 +2678,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID); if (!ret) { - ret = -EINVAL; + /* Error, keep GHCB MSR value as-is */ break; } @@ -2678,10 +2714,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_TERM_REASON_POS); pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - fallthrough; + + ret = -EINVAL; + break; } default: - ret = -EINVAL; + /* Error, keep GHCB MSR value as-is */ + break; } trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id, @@ -2705,14 +2744,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) if (!ghcb_gpa) { vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n"); - return -EINVAL; + + /* Without a GHCB, just return right back to the guest */ + return 1; } if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { /* Unable to map GHCB from guest */ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", ghcb_gpa); - return -EINVAL; + + /* Without a GHCB, just return right back to the guest */ + return 1; } svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; @@ -2722,15 +2765,14 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) exit_code = ghcb_get_sw_exit_code(ghcb); - ret = sev_es_validate_vmgexit(svm); - if (ret) - return ret; + if (!sev_es_validate_vmgexit(svm)) + return 1; sev_es_sync_from_ghcb(svm); ghcb_set_sw_exit_info_1(ghcb, 0); ghcb_set_sw_exit_info_2(ghcb, 0); - ret = -EINVAL; + ret = 1; switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: if (!setup_vmgexit_scratch(svm, true, control->exit_info_2)) @@ -2771,20 +2813,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(ghcb, 1); - ghcb_set_sw_exit_info_2(ghcb, - X86_TRAP_UD | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT); } - ret = 1; break; } case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", control->exit_info_1, control->exit_info_2); + ret = -EINVAL; break; default: ret = svm_invoke_exit_handler(vcpu, exit_code); @@ -2806,7 +2845,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) return -EINVAL; if (!setup_vmgexit_scratch(svm, in, bytes)) - return -EINVAL; + return 1; return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, count, in); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5630c241d5f6..fd3a00c892c7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -265,7 +265,7 @@ u32 svm_msrpm_offset(u32 msr) #define MAX_INST_SIZE 15 -static int get_max_npt_level(void) +static int get_npt_level(void) { #ifdef CONFIG_X86_64 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; @@ -290,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { if (!(efer & EFER_SVME)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, true); /* #GP intercept is still needed for vmware backdoor */ if (!enable_vmware_backdoor) @@ -312,7 +312,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) return ret; } - if (svm_gp_erratum_intercept) + /* + * Never intercept #GP for SEV guests, KVM can't + * decrypt guest memory to workaround the erratum. + */ + if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); } } @@ -585,12 +589,10 @@ static int svm_cpu_init(int cpu) if (!sd) return ret; sd->cpu = cpu; - sd->save_area = alloc_page(GFP_KERNEL); + sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!sd->save_area) goto free_cpu_data; - clear_page(page_address(sd->save_area)); - ret = sev_cpu_init(sd); if (ret) goto free_save_area; @@ -871,47 +873,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * The default MMIO mask is a single bit (excluding the present bit), - * which could conflict with the memory encryption bit. Check for - * memory encryption support and override the default MMIO mask if - * memory encryption is enabled. - */ -static __init void svm_adjust_mmio_mask(void) -{ - unsigned int enc_bit, mask_bit; - u64 msr, mask; - - /* If there is no memory encryption support, use existing mask */ - if (cpuid_eax(0x80000000) < 0x8000001f) - return; - - /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); - if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) - return; - - enc_bit = cpuid_ebx(0x8000001f) & 0x3f; - mask_bit = boot_cpu_data.x86_phys_bits; - - /* Increment the mask bit if it is the same as the encryption bit */ - if (enc_bit == mask_bit) - mask_bit++; - - /* - * If the mask bit location is below 52, then some bits above the - * physical addressing limit will always be reserved, so use the - * rsvd_bits() function to generate the mask. This mask, along with - * the present bit, will be used to generate a page fault with - * PFER.RSV = 1. - * - * If the mask bit location is 52 (or above), then clear the mask. - */ - mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - - kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); -} - static void svm_hardware_teardown(void) { int cpu; @@ -926,191 +887,6 @@ static void svm_hardware_teardown(void) iopm_base = 0; } -static __init void svm_set_cpu_caps(void) -{ - kvm_set_cpu_caps(); - - supported_xss = 0; - - /* CPUID 0x80000001 and 0x8000000A (SVM features) */ - if (nested) { - kvm_cpu_cap_set(X86_FEATURE_SVM); - - if (nrips) - kvm_cpu_cap_set(X86_FEATURE_NRIPS); - - if (npt_enabled) - kvm_cpu_cap_set(X86_FEATURE_NPT); - - if (tsc_scaling) - kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); - - /* Nested VM can receive #VMEXIT instead of triggering #GP */ - kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); - } - - /* CPUID 0x80000008 */ - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - - /* CPUID 0x8000001F (SME/SEV features) */ - sev_set_cpu_caps(); -} - -static __init int svm_hardware_setup(void) -{ - int cpu; - struct page *iopm_pages; - void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); - - /* - * NX is required for shadow paging and for NPT if the NX huge pages - * mitigation is enabled. - */ - if (!boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("NX (Execute Disable) not supported\n"); - return -EOPNOTSUPP; - } - kvm_enable_efer_bits(EFER_NX); - - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - init_msrpm_offsets(); - - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) - kvm_enable_efer_bits(EFER_FFXSR); - - if (tsc_scaling) { - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - tsc_scaling = false; - } else { - pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; - } - } - - tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); - - /* Check for pause filtering support */ - if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - pause_filter_count = 0; - pause_filter_thresh = 0; - } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { - pause_filter_thresh = 0; - } - - if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); - } - - /* - * KVM's MMU doesn't support using 2-level paging for itself, and thus - * NPT isn't supported if the host is using 2-level paging since host - * CR4 is unchanged on VMRUN. - */ - if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) - npt_enabled = false; - - if (!boot_cpu_has(X86_FEATURE_NPT)) - npt_enabled = false; - - /* Force VM NPT level equal to the host's max NPT level */ - kvm_configure_mmu(npt_enabled, get_max_npt_level(), - get_max_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); - - /* Note, SEV setup consumes npt_enabled. */ - sev_hardware_setup(); - - svm_hv_hardware_setup(); - - svm_adjust_mmio_mask(); - - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - - enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); - - if (enable_apicv) { - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } - - if (vls) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || - !IS_ENABLED(CONFIG_X86_64)) { - vls = false; - } else { - pr_info("Virtual VMLOAD VMSAVE supported\n"); - } - } - - if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) - svm_gp_erratum_intercept = false; - - if (vgif) { - if (!boot_cpu_has(X86_FEATURE_VGIF)) - vgif = false; - else - pr_info("Virtual GIF supported\n"); - } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - - svm_set_cpu_caps(); - - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; - - return 0; - -err: - svm_hardware_teardown(); - return r; -} - static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; @@ -1238,9 +1014,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway - * as VMware does. + * as VMware does. Don't intercept #GP for SEV guests as KVM can't + * decrypt guest memory to decode the faulting instruction. */ - if (enable_vmware_backdoor) + if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); svm_set_intercept(svm, INTERCEPT_INTR); @@ -1435,12 +1212,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (err) goto error_free_vmsa_page; - /* We initialize this flag to true to make sure that the is_running - * bit would be set the first time the vcpu is loaded. - */ - if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) - svm->avic_is_running = true; - svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; @@ -1585,12 +1356,27 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static bool svm_get_if_flag(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = to_svm(vcpu)->vmcb; + + return sev_es_guest(vcpu->kvm) + ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK + : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_register_mark_available(vcpu, reg); + switch (reg) { case VCPU_EXREG_PDPTR: - BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + /* + * When !npt_enabled, mmu->pdptrs[] is already available since + * it is always updated per SDM when moving to CRs. + */ + if (npt_enabled) + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); break; default: KVM_BUG_ON(1, vcpu->kvm); @@ -1777,10 +1563,29 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) vmcb_mark_dirty(svm->vmcb, VMCB_DT); } +static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * For guests that don't set guest_state_protected, the cr3 update is + * handled via kvm_mmu_load() while entering the guest. For guests + * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to + * VMCB save area now, since the save area will become the initial + * contents of the VMSA, and future VMCB save area updates won't be + * seen. + */ + if (sev_es_guest(vcpu->kvm)) { + svm->vmcb->save.cr3 = cr3; + vmcb_mark_dirty(svm->vmcb, VMCB_CR); + } +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); u64 hcr0 = cr0; + bool old_paging = is_paging(vcpu); #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { @@ -1797,8 +1602,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #endif vcpu->arch.cr0 = cr0; - if (!npt_enabled) + if (!npt_enabled) { hcr0 |= X86_CR0_PG | X86_CR0_WP; + if (old_paging != is_paging(vcpu)) + svm_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* * re-enable caching here because the QEMU bios @@ -1842,8 +1650,12 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) svm_flush_tlb(vcpu); vcpu->arch.cr4 = cr4; - if (!npt_enabled) + if (!npt_enabled) { cr4 |= X86_CR4_PAE; + + if (!is_paging(vcpu)) + cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -2292,10 +2104,6 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (error_code) goto reinject; - /* All SVM instructions expect page aligned RAX */ - if (svm->vmcb->save.rax & ~PAGE_MASK) - goto reinject; - /* Decode the instruction for usage later */ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) goto reinject; @@ -2313,8 +2121,13 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu)) return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); - } else + } else { + /* All SVM instructions expect page aligned RAX */ + if (svm->vmcb->save.rax & ~PAGE_MASK) + goto reinject; + return emulate_svm_instr(vcpu, opcode); + } reinject: kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); @@ -2508,7 +2321,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, bool ret = false; if (!is_guest_mode(vcpu) || - (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) + (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; cr0 &= ~SVM_CR0_SELECTIVE_MASK; @@ -2880,8 +2693,23 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!msr->host_initiated && !svm->tsc_scaling_enabled) - return 1; + + if (!svm->tsc_scaling_enabled) { + + if (!msr->host_initiated) + return 1; + /* + * In case TSC scaling is not enabled, always + * leave this MSR at the default value. + * + * Due to bug in qemu 6.2.0, it would try to set + * this msr to 0 if tsc scaling is not enabled. + * Ignore this value as well. + */ + if (data != 0 && data != svm->tsc_ratio_msr) + return 1; + break; + } if (data & TSC_RATIO_RSVD) return 1; @@ -3486,6 +3314,55 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vector) +{ + /* + * vcpu->arch.apicv_active must be read after vcpu->mode. + * Pairs with smp_store_release in vcpu_enter_guest. + */ + bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); + + if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Process the interrupt via inject_pending_event */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + return; + } + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); + if (in_guest_mode) { + /* + * Signal the doorbell to tell hardware to inject the IRQ. If + * the vCPU exits the guest before the doorbell chimes, hardware + * will automatically process AVIC interrupts at the next VMRUN. + */ + avic_ring_doorbell(vcpu); + } else { + /* + * Wake the vCPU if it was blocking. KVM will then detect the + * pending IRQ when checking if the vCPU has a wake event. + */ + kvm_vcpu_wake_up(vcpu); + } +} + +static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + kvm_lapic_set_irr(vector, apic); + + /* + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before + * the read of guest_mode. This guarantees that either VMRUN will see + * and process the new vIRR entry, or that svm_complete_interrupt_delivery + * will signal the doorbell if the CPU has already entered the guest. + */ + smp_mb__after_atomic(); + svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); +} + static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3533,11 +3410,13 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_nmi_blocked(vcpu)) + return 0; + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return -EBUSY; - - return !svm_nmi_blocked(vcpu); + return 1; } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -3568,14 +3447,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask - * bit to determine the state of the IF flag. - */ - if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) - return true; - } else if (is_guest_mode(vcpu)) { + if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) @@ -3586,7 +3458,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (nested_exit_on_intr(svm)) return false; } else { - if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + if (!svm_get_if_flag(vcpu)) return true; } @@ -3596,9 +3468,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_interrupt_blocked(vcpu)) + return 0; + /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. @@ -3606,7 +3482,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) return -EBUSY; - return !svm_interrupt_blocked(vcpu); + return 1; } static void svm_enable_irq_window(struct kvm_vcpu *vcpu) @@ -3798,6 +3674,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(vcpu); } +static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + return 1; +} + static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && @@ -3812,7 +3693,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); unsigned long vmcb_pa = svm->current_vmcb->pa; - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); if (sev_es_guest(vcpu->kvm)) { __svm_sev_es_vcpu_run(vmcb_pa); @@ -3832,7 +3713,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) vmload(__sme_page_pa(sd->save_area)); } - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) @@ -3929,9 +3810,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; } + vcpu->arch.regs_dirty = 0; if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); kvm_load_host_xsave_state(vcpu); stgi(); @@ -3963,8 +3845,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - if (npt_enabled) - kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); + vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; /* * We need to handle MC intercepts here before the vcpu has a chance to @@ -3994,9 +3875,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); - /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - return; cr3 = vcpu->arch.cr3; } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); @@ -4215,7 +4093,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - if (!(vmcb_is_intercept(&svm->nested.ctl, + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))) break; @@ -4335,11 +4213,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_smi_blocked(vcpu)) + return 0; + /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) return -EBUSY; - return !svm_smi_blocked(vcpu); + return 1; } static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) @@ -4433,10 +4314,18 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) * Enter the nested guest now */ + vmcb_mark_all_dirty(svm->vmcb01.ptr); + vmcb12 = map.hva; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + if (ret) + goto unmap_save; + + svm->nested.nested_run_pending = 1; + unmap_save: kvm_vcpu_unmap(vcpu, &map_save, true); unmap_map: @@ -4457,79 +4346,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) } } -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { bool smep, smap, is_user; unsigned long cr4; + u64 error_code; + + /* Emulation is always possible when KVM has access to all guest state. */ + if (!sev_guest(vcpu->kvm)) + return true; + + /* #UD and #GP should never be intercepted for SEV guests. */ + WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | + EMULTYPE_TRAP_UD_FORCED | + EMULTYPE_VMWARE_GP)); /* - * When the guest is an SEV-ES guest, emulation is not possible. + * Emulation is impossible for SEV-ES guests as KVM doesn't have access + * to guest register state. */ if (sev_es_guest(vcpu->kvm)) return false; /* + * Emulation is possible if the instruction is already decoded, e.g. + * when completing I/O after returning from userspace. + */ + if (emul_type & EMULTYPE_NO_DECODE) + return true; + + /* + * Emulation is possible for SEV guests if and only if a prefilled + * buffer containing the bytes of the intercepted instruction is + * available. SEV guest memory is encrypted with a guest specific key + * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and + * decode garbage. + * + * Inject #UD if KVM reached this point without an instruction buffer. + * In practice, this path should never be hit by a well-behaved guest, + * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path + * is still theoretically reachable, e.g. via unaccelerated fault-like + * AVIC access, and needs to be handled by KVM to avoid putting the + * guest into an infinite loop. Injecting #UD is somewhat arbitrary, + * but its the least awful option given lack of insight into the guest. + */ + if (unlikely(!insn)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } + + /* + * Emulate for SEV guests if the insn buffer is not empty. The buffer + * will be empty if the DecodeAssist microcode cannot fetch bytes for + * the faulting instruction because the code fetch itself faulted, e.g. + * the guest attempted to fetch from emulated MMIO or a guest page + * table used to translate CS:RIP resides in emulated MMIO. + */ + if (likely(insn_len)) + return true; + + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * * Errata: - * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is - * possible that CPU microcode implementing DecodeAssist will fail - * to read bytes of instruction which caused #NPF. In this case, - * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly - * return 0 instead of the correct guest instruction bytes. - * - * This happens because CPU microcode reading instruction bytes - * uses a special opcode which attempts to read data using CPL=0 - * privileges. The microcode reads CS:RIP and if it hits a SMAP - * fault, it gives up and returns no instruction bytes. + * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is + * possible that CPU microcode implementing DecodeAssist will fail to + * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly + * be '0'. This happens because microcode reads CS:RIP using a _data_ + * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode + * gives up and does not fill the instruction bytes buffer. * - * Detection: - * We reach here in case CPU supports DecodeAssist, raised #NPF and - * returned 0 in GuestIntrBytes field of the VMCB. - * First, errata can only be triggered in case vCPU CR4.SMAP=1. - * Second, if vCPU CR4.SMEP=1, errata could only be triggered - * in case vCPU CPL==3 (Because otherwise guest would have triggered - * a SMEP fault instead of #NPF). - * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL. - * As most guests enable SMAP if they have also enabled SMEP, use above - * logic in order to attempt minimize false-positive of detecting errata - * while still preserving all cases semantic correctness. + * As above, KVM reaches this point iff the VM is an SEV guest, the CPU + * supports DecodeAssist, a #NPF was raised, KVM's page fault handler + * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the + * GuestIntrBytes field of the VMCB. * - * Workaround: - * To determine what instruction the guest was executing, the hypervisor - * will have to decode the instruction at the instruction pointer. + * This does _not_ mean that the erratum has been encountered, as the + * DecodeAssist will also fail if the load for CS:RIP hits a legitimate + * #PF, e.g. if the guest attempt to execute from emulated MMIO and + * encountered a reserved/not-present #PF. * - * In non SEV guest, hypervisor will be able to read the guest - * memory to decode the instruction pointer when insn_len is zero - * so we return true to indicate that decoding is possible. + * To hit the erratum, the following conditions must be true: + * 1. CR4.SMAP=1 (obviously). + * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot + * have been hit as the guest would have encountered a SMEP + * violation #PF, not a #NPF. + * 3. The #NPF is not due to a code fetch, in which case failure to + * retrieve the instruction bytes is legitimate (see abvoe). * - * But in the SEV guest, the guest memory is encrypted with the - * guest specific key and hypervisor will not be able to decode the - * instruction pointer so we will not able to workaround it. Lets - * print the error and request to kill the guest. + * In addition, don't apply the erratum workaround if the #NPF occurred + * while translating guest page tables (see below). */ - if (likely(!insn || insn_len)) - return true; - - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. - */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; + error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) + goto resume_guest; cr4 = kvm_read_cr4(vcpu); smep = cr4 & X86_CR4_SMEP; smap = cr4 & X86_CR4_SMAP; is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - if (!sev_guest(vcpu->kvm)) - return true; - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + + /* + * If the fault occurred in userspace, arbitrarily inject #GP + * to avoid killing the guest and to hopefully avoid confusing + * the guest kernel too much, e.g. injecting #PF would not be + * coherent with respect to the guest's page tables. Request + * triple fault if the fault occurred in the kernel as there's + * no fault that KVM can inject without confusing the guest. + * In practice, the triple fault is moot as no sane SEV kernel + * will execute from user memory while also running with SMAP=1. + */ + if (is_user) + kvm_inject_gp(vcpu, 0); + else + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } +resume_guest: + /* + * If the erratum was not hit, simply resume the guest and let it fault + * again. While awful, e.g. the vCPU may get stuck in an infinite loop + * if the fault is at CPL=0, it's the lesser of all evils. Exiting to + * userspace will kill the guest, and letting the emulator read garbage + * will yield random behavior and potentially corrupt the guest. + * + * Simply resuming the guest is technically not a violation of the SEV + * architecture. AMD's APM states that all code fetches and page table + * accesses for SEV guest are encrypted, regardless of the C-Bit. The + * APM also states that encrypted accesses to MMIO are "ignored", but + * doesn't explicitly define "ignored", i.e. doing nothing and letting + * the guest spin is technically "ignoring" the access. + */ return false; } @@ -4596,8 +4546,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .vcpu_blocking = svm_vcpu_blocking, - .vcpu_unblocking = svm_vcpu_unblocking, + .vcpu_blocking = avic_vcpu_blocking, + .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, @@ -4609,6 +4559,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .set_cr0 = svm_set_cr0, + .post_set_cr3 = svm_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, @@ -4621,12 +4572,14 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .get_if_flag = svm_get_if_flag, .tlb_flush_all = svm_flush_tlb, .tlb_flush_current = svm_flush_tlb, .tlb_flush_gva = svm_flush_tlb_gva, .tlb_flush_guest = svm_flush_tlb, + .vcpu_pre_run = svm_vcpu_pre_run, .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, @@ -4651,7 +4604,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, - .sync_pir_to_irr = kvm_lapic_find_highest_irr, .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, @@ -4681,7 +4633,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, - .deliver_posted_interrupt = svm_deliver_avic_intr, + .deliver_interrupt = svm_deliver_interrupt, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, @@ -4708,6 +4660,244 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, }; +/* + * The default MMIO mask is a single bit (excluding the present bit), + * which could conflict with the memory encryption bit. Check for + * memory encryption support and override the default MMIO mask if + * memory encryption is enabled. + */ +static __init void svm_adjust_mmio_mask(void) +{ + unsigned int enc_bit, mask_bit; + u64 msr, mask; + + /* If there is no memory encryption support, use existing mask */ + if (cpuid_eax(0x80000000) < 0x8000001f) + return; + + /* If memory encryption is not enabled, use existing mask */ + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) + return; + + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; + mask_bit = boot_cpu_data.x86_phys_bits; + + /* Increment the mask bit if it is the same as the encryption bit */ + if (enc_bit == mask_bit) + mask_bit++; + + /* + * If the mask bit location is below 52, then some bits above the + * physical addressing limit will always be reserved, so use the + * rsvd_bits() function to generate the mask. This mask, along with + * the present bit, will be used to generate a page fault with + * PFER.RSV = 1. + * + * If the mask bit location is 52 (or above), then clear the mask. + */ + mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); +} + +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + supported_xss = 0; + + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ + if (nested) { + kvm_cpu_cap_set(X86_FEATURE_SVM); + kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); + + if (nrips) + kvm_cpu_cap_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); + + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + + /* Nested VM can receive #VMEXIT instead of triggering #GP */ + kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); + } + + /* CPUID 0x80000008 */ + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* AMD PMU PERFCTR_CORE CPUID */ + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + + /* CPUID 0x8000001F (SME/SEV features) */ + sev_set_cpu_caps(); +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + void *iopm_va; + int r; + unsigned int order = get_order(IOPM_SIZE); + + /* + * NX is required for shadow paging and for NPT if the NX huge pages + * mitigation is enabled. + */ + if (!boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + kvm_enable_efer_bits(EFER_NX); + + iopm_pages = alloc_pages(GFP_KERNEL, order); + + if (!iopm_pages) + return -ENOMEM; + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + init_msrpm_offsets(); + + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; + } + } + + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); + + /* Check for pause filtering support */ + if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { + pause_filter_count = 0; + pause_filter_thresh = 0; + } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { + pause_filter_thresh = 0; + } + + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + } + + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) + npt_enabled = false; + + if (!boot_cpu_has(X86_FEATURE_NPT)) + npt_enabled = false; + + /* Force VM NPT level equal to the host's paging level */ + kvm_configure_mmu(npt_enabled, get_npt_level(), + get_npt_level(), PG_LEVEL_1G); + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + + /* Note, SEV setup consumes npt_enabled. */ + sev_hardware_setup(); + + svm_hv_hardware_setup(); + + svm_adjust_mmio_mask(); + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + + if (nrips) { + if (!boot_cpu_has(X86_FEATURE_NRIPS)) + nrips = false; + } + + enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); + + if (enable_apicv) { + pr_info("AVIC enabled\n"); + + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } else { + svm_x86_ops.vcpu_blocking = NULL; + svm_x86_ops.vcpu_unblocking = NULL; + } + + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + + if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) + svm_gp_erratum_intercept = false; + + if (vgif) { + if (!boot_cpu_has(X86_FEATURE_VGIF)) + vgif = false; + else + pr_info("Virtual GIF supported\n"); + } + + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + + if (!enable_pmu) + pr_info("PMU virtualization is disabled\n"); + + svm_set_cpu_caps(); + + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + + return 0; + +err: + svm_hardware_teardown(); + return r; +} + + static struct kvm_x86_init_ops svm_init_ops __initdata = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 437e68504e66..fa98d6844728 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -79,6 +79,7 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ + unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; }; @@ -104,6 +105,40 @@ struct kvm_vmcb_info { uint64_t asid_generation; }; +struct vmcb_save_area_cached { + u64 efer; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; +}; + +struct vmcb_ctrl_area_cached { + u32 intercepts[MAX_INTERCEPT]; + u16 pause_filter_thresh; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 virt_ext; +}; + struct svm_nested_state { struct kvm_vmcb_info vmcb02; u64 hsave_msr; @@ -119,7 +154,13 @@ struct svm_nested_state { bool nested_run_pending; /* cache for control fields of the guest */ - struct vmcb_control_area ctl; + struct vmcb_ctrl_area_cached ctl; + + /* + * Note: this struct is not kept up-to-date while L2 runs; it is only + * valid within nested_svm_vmrun. + */ + struct vmcb_save_area_cached save; bool initialized; }; @@ -184,7 +225,6 @@ struct vcpu_svm { u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; - bool avic_is_running; /* * Per-vcpu list of struct amd_svm_iommu_ir: @@ -247,7 +287,7 @@ static __always_inline bool sev_es_guest(struct kvm *kvm) #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - return sev_guest(kvm) && sev->es_active; + return sev->es_active && !WARN_ON_ONCE(!sev->active); #else return false; #endif @@ -264,11 +304,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb) & ~VMCB_ALWAYS_DIRTY_MASK; } -static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit) -{ - return (vmcb->control.clean & (1 << bit)); -} - static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); @@ -284,6 +319,16 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +/* + * Only the PDPTRs are loaded on demand into the shadow MMU. All other + * fields are synchronized in handle_exit, because accessing the VMCB is cheap. + * + * CR3 might be out of date in the VMCB but it is not marked dirty; instead, + * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3 + * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB. + */ +#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR) + static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) { WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); @@ -302,6 +347,12 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) return test_bit(bit, (unsigned long *)&control->intercepts); } +static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, (unsigned long *)&control->intercepts); +} + static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -438,6 +489,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vec); /* nested.c */ @@ -454,22 +507,22 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) static inline bool nested_exit_on_smi(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); } static inline bool nested_exit_on_intr(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); } static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); -void svm_leave_nested(struct vcpu_svm *svm); +void svm_leave_nested(struct kvm_vcpu *vcpu); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); @@ -493,8 +546,10 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control); +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control); +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); @@ -503,28 +558,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) -#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 -#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) - -#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) -#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) -#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) - -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - -static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 *entry = svm->avic_physical_id_cache; - - if (!entry) - return false; - - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); -} - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -541,12 +574,12 @@ bool svm_check_apicv_inhibit_reasons(ulong bit); void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); -void svm_vcpu_blocking(struct kvm_vcpu *vcpu); -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_vcpu_blocking(struct kvm_vcpu *vcpu); +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_ring_doorbell(struct kvm_vcpu *vcpu); /* sev.c */ diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index c53b8bf8d013..489ca56212c6 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -46,6 +46,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) hve->hv_enlightenments_control.enlightened_npt_tlb = 1; + + if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP) + hve->hv_enlightenments_control.msr_bitmap = 1; } static inline void svm_hv_hardware_setup(void) @@ -83,14 +86,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct hv_enlightenments *hve = (struct hv_enlightenments *)vmcb->control.reserved_sw; - /* - * vmcb can be NULL if called during early vcpu init. - * And its okay not to mark vmcb dirty during vcpu init - * as we mark it dirty unconditionally towards end of vcpu - * init phase. - */ - if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && - hve->hv_enlightenments_control.msr_bitmap) + if (hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); } diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 4fa17df123cd..dfaeb47fcf2a 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -148,7 +148,7 @@ SYM_FUNC_START(__svm_vcpu_run) pop %edi #endif pop %_ASM_BP - ret + RET 3: cmpb $0, kvm_rebooting jne 2b @@ -202,7 +202,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) pop %edi #endif pop %_ASM_BP - ret + RET 3: cmpb $0, kvm_rebooting jne 2b |