diff options
37 files changed, 1862 insertions, 1393 deletions
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 0aa4817b466d..1fc860c007a3 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -38,25 +38,24 @@ the mmu-lock on x86. Currently, the page fault can be fast in one of the following two cases: 1. Access Tracking: The SPTE is not present, but it is marked for access - tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to - restore the saved R/X bits. This is described in more detail later below. + tracking. That means we need to restore the saved R/X bits. This is + described in more detail later below. -2. Write-Protection: The SPTE is present and the fault is - caused by write-protect. That means we just need to change the W bit of - the spte. +2. Write-Protection: The SPTE is present and the fault is caused by + write-protect. That means we just need to change the W bit of the spte. -What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and -SPTE_MMU_WRITEABLE bit on the spte: +What we use to avoid all the race is the Host-writable bit and MMU-writable bit +on the spte: -- SPTE_HOST_WRITEABLE means the gfn is writable on host. -- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when - the gfn is writable on guest mmu and it is not write-protected by shadow - page write-protection. +- Host-writable means the gfn is writable in the host kernel page tables and in + its KVM memslot. +- MMU-writable means the gfn is writable in the guest's mmu and it is not + write-protected by shadow page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or -restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This -is safe because whenever changing these bits can be detected by cmpxchg. +bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, to restore the saved +R/X bits if for an access-traced spte, or both. This is safe because whenever +changing these bits can be detected by cmpxchg. But we need carefully check these cases: @@ -185,17 +184,17 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update(). Lockless Access Tracking: This is used for Intel CPUs that are using EPT but do not support the EPT A/D -bits. In this case, when the KVM MMU notifier is called to track accesses to a -page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present -by clearing the RWX bits in the PTE and storing the original R & X bits in -some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the -PTE (using the ignored bit 62). When the VM tries to access the page later on, -a fault is generated and the fast page fault mechanism described above is used -to atomically restore the PTE to a Present state. The W bit is not saved when -the PTE is marked for access tracking and during restoration to the Present -state, the W bit is set depending on whether or not it was a write access. If -it wasn't, then the W bit will remain clear until a write access happens, at -which time it will be set using the Dirty tracking mechanism described above. +bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and +when the KVM MMU notifier is called to track accesses to a page (via +kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware +by clearing the RWX bits in the PTE and storing the original R & X bits in more +unused/ignored bits. When the VM tries to access the page later on, a fault is +generated and the fast page fault mechanism described above is used to +atomically restore the PTE to a Present state. The W bit is not saved when the +PTE is marked for access tracking and during restoration to the Present state, +the W bit is set depending on whether or not it was a write access. If it +wasn't, then the W bit will remain clear until a write access happens, at which +time it will be set using the Dirty tracking mechanism described above. 3. Reference ------------ diff --git a/Documentation/virt/kvm/s390-diag.rst b/Documentation/virt/kvm/s390-diag.rst index eaac4864d3d6..ca85f030eb0b 100644 --- a/Documentation/virt/kvm/s390-diag.rst +++ b/Documentation/virt/kvm/s390-diag.rst @@ -84,3 +84,36 @@ If the function code specifies 0x501, breakpoint functions may be performed. This function code is handled by userspace. This diagnose function code has no subfunctions and uses no parameters. + + +DIAGNOSE function code 'X'9C - Voluntary Time Slice Yield +--------------------------------------------------------- + +General register 1 contains the target CPU address. + +In a guest of a hypervisor like LPAR, KVM or z/VM using shared host CPUs, +DIAGNOSE with function code 0x9c may improve system performance by +yielding the host CPU on which the guest CPU is running to be assigned +to another guest CPU, preferably the logical CPU containing the specified +target CPU. + + +DIAG 'X'9C forwarding ++++++++++++++++++++++ + +The guest may send a DIAGNOSE 0x9c in order to yield to a certain +other vcpu. An example is a Linux guest that tries to yield to the vcpu +that is currently holding a spinlock, but not running. + +However, on the host the real cpu backing the vcpu may itself not be +running. +Forwarding the DIAGNOSE 0x9c initially sent by the guest to yield to +the backing cpu will hopefully cause that cpu, and thus subsequently +the guest's vcpu, to be scheduled. + + +diag9c_forwarding_hz + KVM kernel parameter allowing to specify the maximum number of DIAGNOSE + 0x9c forwarding per second in the purpose of avoiding a DIAGNOSE 0x9c + forwarding storm. + A value of 0 turns the forwarding off. diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 6bcfc5614bbc..0af3e032a49d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -454,6 +454,7 @@ struct kvm_vcpu_stat { u64 diagnose_44; u64 diagnose_9c; u64 diagnose_9c_ignored; + u64 diagnose_9c_forward; u64 diagnose_258; u64 diagnose_308; u64 diagnose_500; diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 01e360004481..e317fd4866c1 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -63,5 +63,6 @@ extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); extern int __cpu_disable(void); extern void schedule_mcck_handler(void); +void notrace smp_yield_cpu(int cpu); #endif /* __ASM_SMP_H */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 58c8afa3da65..2fec2b80d35d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -429,6 +429,7 @@ void notrace smp_yield_cpu(int cpu) asm volatile("diag %0,0,0x9c" : : "d" (pcpu_devices[cpu].address)); } +EXPORT_SYMBOL_GPL(smp_yield_cpu); /* * Send cpus emergency shutdown signal. This gives the cpus the diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 5b8ec1c447e1..02c146f9e5cd 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -150,6 +150,19 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu) return 0; } +static int forward_cnt; +static unsigned long cur_slice; + +static int diag9c_forwarding_overrun(void) +{ + /* Reset the count on a new slice */ + if (time_after(jiffies, cur_slice)) { + cur_slice = jiffies; + forward_cnt = diag9c_forwarding_hz / HZ; + } + return forward_cnt-- <= 0 ? 1 : 0; +} + static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) { struct kvm_vcpu *tcpu; @@ -167,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) if (!tcpu) goto no_yield; - /* target already running */ - if (READ_ONCE(tcpu->cpu) >= 0) - goto no_yield; + /* target guest VCPU already running */ + if (READ_ONCE(tcpu->cpu) >= 0) { + if (!diag9c_forwarding_hz || diag9c_forwarding_overrun()) + goto no_yield; + + /* target host CPU already running */ + if (!vcpu_is_preempted(tcpu->cpu)) + goto no_yield; + smp_yield_cpu(tcpu->cpu); + VCPU_EVENT(vcpu, 5, + "diag time slice end directed to %d: yield forwarded", + tid); + vcpu->stat.diagnose_9c_forward++; + return 0; + } if (kvm_vcpu_yield_to(tcpu) <= 0) goto no_yield; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 6d6b57059493..b9f85b2dc053 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) * kvm_s390_shadow_tables - walk the guest page table and create shadow tables * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the page table address result + * @pgt: pointer to the beginning of the page table for the given address if + * successful (return value 0), or to the first invalid DAT entry in + * case of exceptions (return value > 0) * @fake: pgt references contiguous guest memory block, not a pgtable */ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, @@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, rfte.val = ptr; goto shadow_r2t; } + *pgt = ptr + vaddr.rfx * 8; rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); if (rc) return rc; @@ -1060,6 +1063,7 @@ shadow_r2t: rste.val = ptr; goto shadow_r3t; } + *pgt = ptr + vaddr.rsx * 8; rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); if (rc) return rc; @@ -1087,6 +1091,7 @@ shadow_r3t: rtte.val = ptr; goto shadow_sgt; } + *pgt = ptr + vaddr.rtx * 8; rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); if (rc) return rc; @@ -1123,6 +1128,7 @@ shadow_sgt: ste.val = ptr; goto shadow_pgt; } + *pgt = ptr + vaddr.sx * 8; rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); if (rc) return rc; @@ -1157,6 +1163,8 @@ shadow_pgt: * @vcpu: virtual cpu * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap + * @datptr: will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags * * Returns: - 0 if the shadow fault was successfully resolved * - > 0 (pgm exception code) on exceptions while faulting @@ -1165,11 +1173,11 @@ shadow_pgt: * - -ENOMEM if out of memory */ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr) + unsigned long saddr, unsigned long *datptr) { union vaddress vaddr; union page_table_entry pte; - unsigned long pgt; + unsigned long pgt = 0; int dat_protection, fake; int rc; @@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } - if (!rc) - rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + + switch (rc) { + case PGM_SEGMENT_TRANSLATION: + case PGM_REGION_THIRD_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_FIRST_TRANS: + pgt |= PEI_NOT_PTE; + break; + case 0: + pgt += vaddr.px * 8; + rc = gmap_read_table(sg->parent, pgt, &pte.val); + } + if (datptr) + *datptr = pgt | dat_protection * PEI_DAT_PROT; if (!rc && pte.i) rc = PGM_PAGE_TRANSLATION; if (!rc && pte.z) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index f4c51756c462..7c72a5e3449f 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,17 +18,14 @@ /** * kvm_s390_real_to_abs - convert guest real address to guest absolute address - * @vcpu - guest virtual cpu + * @prefix - guest prefix * @gra - guest real address * * Returns the guest absolute address that corresponds to the passed guest real - * address @gra of a virtual guest cpu by applying its prefix. + * address @gra of by applying the given prefix. */ -static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, - unsigned long gra) +static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra) { - unsigned long prefix = kvm_s390_get_prefix(vcpu); - if (gra < 2 * PAGE_SIZE) gra += prefix; else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE) @@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, } /** + * kvm_s390_real_to_abs - convert guest real address to guest absolute address + * @vcpu - guest virtual cpu + * @gra - guest real address + * + * Returns the guest absolute address that corresponds to the passed guest real + * address @gra of a virtual guest cpu by applying its prefix. + */ +static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, + unsigned long gra) +{ + return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra); +} + +/** + * _kvm_s390_logical_to_effective - convert guest logical to effective address + * @psw: psw of the guest + * @ga: guest logical address + * + * Convert a guest logical address to an effective address by applying the + * rules of the addressing mode defined by bits 31 and 32 of the given PSW + * (extendended/basic addressing mode). + * + * Depending on the addressing mode, the upper 40 bits (24 bit addressing + * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing + * mode) of @ga will be zeroed and the remaining bits will be returned. + */ +static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw, + unsigned long ga) +{ + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) + return ga; + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) + return ga & ((1UL << 31) - 1); + return ga & ((1UL << 24) - 1); +} + +/** * kvm_s390_logical_to_effective - convert guest logical to effective address * @vcpu: guest virtual cpu * @ga: guest logical address @@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, unsigned long ga) { - psw_t *psw = &vcpu->arch.sie_block->gpsw; - - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) - return ga; - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) - return ga & ((1UL << 31) - 1); - return ga & ((1UL << 24) - 1); + return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga); } /* @@ -359,7 +387,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +/* MVPG PEI indication bits */ +#define PEI_DAT_PROT 2 +#define PEI_NOT_PTE 4 + int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr); + unsigned long saddr, unsigned long *datptr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2f09e9d7dc95..95ef9193f12e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -158,6 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("instruction_diag_44", diagnose_44), VCPU_STAT("instruction_diag_9c", diagnose_9c), VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored), + VCPU_STAT("diag_9c_forward", diagnose_9c_forward), VCPU_STAT("instruction_diag_258", diagnose_258), VCPU_STAT("instruction_diag_308", diagnose_308), VCPU_STAT("instruction_diag_500", diagnose_500), @@ -185,6 +186,11 @@ static bool use_gisa = true; module_param(use_gisa, bool, 0644); MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it."); +/* maximum diag9c forwarding per second */ +unsigned int diag9c_forwarding_hz; +module_param(diag9c_forwarding_hz, uint, 0644); +MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond @@ -4307,16 +4313,16 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu) kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; if (MACHINE_HAS_GS) { + preempt_disable(); __ctl_set_bit(2, 4); if (vcpu->arch.gs_enabled) save_gs_cb(current->thread.gs_cb); - preempt_disable(); current->thread.gs_cb = vcpu->arch.host_gscb; restore_gs_cb(vcpu->arch.host_gscb); - preempt_enable(); if (!vcpu->arch.host_gscb) __ctl_clear_bit(2, 4); vcpu->arch.host_gscb = NULL; + preempt_enable(); } /* SIE will save etoken directly into SDNX and therefore kvm_run */ } @@ -4542,7 +4548,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) /* * As we are starting a second VCPU, we have to disable * the IBS facility on all VCPUs to remove potentially - * oustanding ENABLE requests. + * outstanding ENABLE requests. */ __disable_ibs_on_all_vcpus(vcpu->kvm); } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 79dcd647b378..9fad25109b0d 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -471,4 +471,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, * @kvm: the KVM guest */ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); + +/** + * diag9c_forwarding_hz + * + * Set the maximum number of diag9c forwarding per second + */ +extern unsigned int diag9c_forwarding_hz; + #endif diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index bd803e091918..4002a24bc43a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -417,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) memcpy((void *)((u64)scb_o + 0xc0), (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); break; - case ICPT_PARTEXEC: - /* MVPG only */ - memcpy((void *)((u64)scb_o + 0xc0), - (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); - break; } if (scb_s->ihcpu != 0xffffU) @@ -620,10 +615,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); if (!rc && (scb_s->ecb & ECB_TE)) rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE); + prefix + PAGE_SIZE, NULL); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -914,7 +909,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) current->thread.gmap_addr, 1); rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_addr); + current->thread.gmap_addr, NULL); if (rc > 0) { rc = inject_fault(vcpu, rc, current->thread.gmap_addr, @@ -936,7 +931,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu, { if (vsie_page->fault_addr) kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - vsie_page->fault_addr); + vsie_page->fault_addr, NULL); vsie_page->fault_addr = 0; } @@ -984,6 +979,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* + * Get a register for a nested guest. + * @vcpu the vcpu of the guest + * @vsie_page the vsie_page for the nested guest + * @reg the register number, the upper 4 bits are ignored. + * returns: the value of the register. + */ +static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg) +{ + /* no need to validate the parameter and/or perform error handling */ + reg &= 0xf; + switch (reg) { + case 15: + return vsie_page->scb_s.gg15; + case 14: + return vsie_page->scb_s.gg14; + default: + return vcpu->run->s.regs.gprs[reg]; + } +} + +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + unsigned long pei_dest, pei_src, src, dest, mask, prefix; + u64 *pei_block = &vsie_page->scb_o->mcic; + int edat, rc_dest, rc_src; + union ctlreg0 cr0; + + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK); + prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + + dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask; + dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso; + src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; + src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; + + rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); + rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + /* + * Either everything went well, or something non-critical went wrong + * e.g. because of a race. In either case, simply retry. + */ + if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) { + retry_vsie_icpt(vsie_page); + return -EAGAIN; + } + /* Something more serious went wrong, propagate the error */ + if (rc_dest < 0) + return rc_dest; + if (rc_src < 0) + return rc_src; + + /* The only possible suppressing exception: just deliver it */ + if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) { + clear_vsie_icpt(vsie_page); + rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC); + WARN_ON_ONCE(rc_dest); + return 1; + } + + /* + * Forward the PEI intercept to the guest if it was a page fault, or + * also for segment and region table faults if EDAT applies. + */ + if (edat) { + rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0; + rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0; + } else { + rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0; + rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0; + } + if (!rc_dest && !rc_src) { + pei_block[0] = pei_dest; + pei_block[1] = pei_src; + return 1; + } + + retry_vsie_icpt(vsie_page); + + /* + * The host has edat, and the guest does not, or it was an ASCE type + * exception. The host needs to inject the appropriate DAT interrupts + * into the guest. + */ + if (rc_dest) + return inject_fault(vcpu, rc_dest, dest, 1); + return inject_fault(vcpu, rc_src, src, 0); +} + +/* * Run the vsie on a shadow scb and a shadow gmap, without any further * sanity checks, handling SIE faults. * @@ -1071,6 +1158,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if ((scb_s->ipa & 0xf000) != 0xf000) scb_s->ipa += 0x1000; break; + case ICPT_PARTEXEC: + if (scb_s->ipa == 0xb254) + rc = vsie_handle_mvpg(vcpu, vsie_page); + break; } return rc; } diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1f918f5e0055..dddc746b5455 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -338,6 +338,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9bc091ecaaeb..a52f973bdff6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1125,6 +1125,7 @@ struct kvm_vcpu_stat { u64 req_event; u64 halt_poll_success_ns; u64 halt_poll_fail_ns; + u64 nested_run; }; struct x86_instruction_info; @@ -1251,8 +1252,8 @@ struct kvm_x86_ops { int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd, - int pgd_level); + void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int root_level); bool (*has_wbinvd_exit)(void); @@ -1339,6 +1340,7 @@ struct kvm_x86_ops { struct kvm_x86_nested_ops { int (*check_events)(struct kvm_vcpu *vcpu); bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); + void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, unsigned user_data_size); @@ -1410,9 +1412,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1520,6 +1519,11 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); +int kvm_emulate_invd(struct kvm_vcpu *vcpu); +int kvm_emulate_mwait(struct kvm_vcpu *vcpu); +int kvm_handle_invalid_op(struct kvm_vcpu *vcpu); +int kvm_emulate_monitor(struct kvm_vcpu *vcpu); int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); @@ -1548,14 +1552,14 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); -int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); +int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); -bool kvm_rdpmc(struct kvm_vcpu *vcpu); +int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); @@ -1596,9 +1600,6 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); -int kvm_mmu_load(struct kvm_vcpu *vcpu); -void kvm_mmu_unload(struct kvm_vcpu *vcpu); -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1c561945b426..772e60efe243 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -269,7 +269,9 @@ struct vmcb_save_area { * SEV-ES guests when referenced through the GHCB or for * saving to the host save area. */ - u8 reserved_7[80]; + u8 reserved_7[72]; + u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ + u8 reserved_7b[4]; u32 pkru; u8 reserved_7a[20]; u64 reserved_8; /* rax already available at 0x01f8 */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cc369b9ad8f1..0050f39e90d9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2869,7 +2869,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) return; if (is_guest_mode(vcpu)) { - r = kvm_x86_ops.nested_ops->check_events(vcpu); + r = kvm_check_nested_events(vcpu); if (r < 0) return; /* diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c68bfc3e2402..88d0ed5225a4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -59,7 +59,8 @@ static __always_inline u64 rsvd_bits(int s, int e) return ((2ULL << (e - s)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); @@ -73,6 +74,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); +int kvm_mmu_load(struct kvm_vcpu *vcpu); +void kvm_mmu_unload(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); + static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE)) @@ -102,8 +107,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) if (!VALID_PAGE(root_hpa)) return; - static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu), - vcpu->arch.mmu->shadow_root_level); + static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa, + vcpu->arch.mmu->shadow_root_level); } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -124,7 +129,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * write-protects guest page to sync the guest modification, b) another one is * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences * between these two sorts are: - * 1) the first case clears SPTE_MMU_WRITEABLE bit. + * 1) the first case clears MMU-writable bit. * 2) the first case requires flushing tlb immediately avoiding corrupting * shadow page table between all vcpus so it should be in the protection of * mmu-lock. And the another case does not need to flush tlb until returning @@ -135,17 +140,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * So, there is the problem: the first case can meet the corrupted tlb caused * by another case which write-protects pages but without flush tlb * immediately. In order to making the first case be aware this problem we let - * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit - * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit. + * it flush tlb if we try to write-protect a spte whose MMU-writable bit + * is set, it works since another case never touches MMU-writable bit. * * Anyway, whenever a spte is updated (only permission and status bits are - * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes + * changed) we need to check whether the spte with MMU-writable becomes * readonly, if that happens, we need to flush tlb. Fortunately, * mmu_spte_update() has already handled it perfectly. * - * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK: + * The rules to use MMU-writable and PT_WRITABLE_MASK: * - if we want to see if it has writable tlb entry or if the spte can be - * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most + * writable on the mmu mapping, check MMU-writable, this is the most * case, otherwise * - if we fix page fault on the spte or do write-protection by dirty logging, * check PT_WRITABLE_MASK. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d75524bc8423..7a99e59c8c1c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -48,6 +48,7 @@ #include <asm/memtype.h> #include <asm/cmpxchg.h> #include <asm/io.h> +#include <asm/set_memory.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> #include "trace.h" @@ -215,10 +216,10 @@ bool is_nx_huge_page_enabled(void) static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { - u64 mask = make_mmio_spte(vcpu, gfn, access); + u64 spte = make_mmio_spte(vcpu, gfn, access); - trace_mark_mmio_spte(sptep, gfn, mask); - mmu_spte_set(sptep, mask); + trace_mark_mmio_spte(sptep, gfn, spte); + mmu_spte_set(sptep, spte); } static gfn_t get_mmio_spte_gfn(u64 spte) @@ -236,17 +237,6 @@ static unsigned get_mmio_spte_access(u64 spte) return spte & shadow_mmio_access_mask; } -static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - kvm_pfn_t pfn, unsigned int access) -{ - if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(vcpu, sptep, gfn, access); - return true; - } - - return false; -} - static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { u64 kvm_gen, spte_gen, gen; @@ -1118,7 +1108,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) rmap_printk("spte %p %llx\n", sptep, *sptep); if (pt_protect) - spte &= ~SPTE_MMU_WRITEABLE; + spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; return mmu_spte_update(sptep, spte); @@ -1424,17 +1414,15 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -static __always_inline int -kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, - int level, - unsigned long data)) +typedef int (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, unsigned long data); + +static __always_inline int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + rmap_handler_t handler) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1473,12 +1461,7 @@ kvm_handle_hva_range(struct kvm *kvm, } static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, int level, - unsigned long data)) + unsigned long data, rmap_handler_t handler) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } @@ -2421,6 +2404,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); + /* + * Note, this check is intentionally soft, it only guarantees that one + * page is available, while the caller may end up allocating as many as + * four pages, e.g. for PAE roots or for 5-level paging. Temporarily + * exceeding the (arbitrary by default) limit will not harm the host, + * being too agressive may unnecessarily kill the guest, and getting an + * exact count is far more trouble than it's worth, especially in the + * page fault paths. + */ if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; return 0; @@ -2561,9 +2553,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp; int ret; - if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) - return 0; - sp = sptep_to_sp(sptep); ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, @@ -2593,6 +2582,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); + if (unlikely(is_noslot_pfn(pfn))) { + mark_mmio_spte(vcpu, sptep, gfn, pte_access); + return RET_PF_EMULATE; + } + if (is_shadow_present_pte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink @@ -2626,9 +2620,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - if (unlikely(is_mmio_spte(*sptep))) - ret = RET_PF_EMULATE; - /* * The fault is fully spurious if and only if the new SPTE and old SPTE * are identical, and emulation is not required. @@ -2946,9 +2937,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, return true; } - if (unlikely(is_noslot_pfn(pfn))) + if (unlikely(is_noslot_pfn(pfn))) { vcpu_cache_mmio_info(vcpu, gva, gfn, access & shadow_mmio_access_mask); + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. + */ + if (unlikely(!shadow_mmio_value)) { + *ret_val = RET_PF_EMULATE; + return true; + } + } return false; } @@ -3061,6 +3062,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!is_shadow_present_pte(spte)) break; + if (!is_shadow_present_pte(spte)) + break; + sp = sptep_to_sp(iterator.sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3193,14 +3197,17 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); - } else { - for (i = 0; i < 4; ++i) - if (mmu->pae_root[i] != 0) - mmu_free_root_page(kvm, - &mmu->pae_root[i], - &invalid_list); - mmu->root_hpa = INVALID_PAGE; + } else if (mmu->pae_root) { + for (i = 0; i < 4; ++i) { + if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) + continue; + + mmu_free_root_page(kvm, &mmu->pae_root[i], + &invalid_list); + mmu->pae_root[i] = INVALID_PAE_ROOT; + } } + mmu->root_hpa = INVALID_PAGE; mmu->root_pgd = 0; } @@ -3226,155 +3233,182 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, { struct kvm_mmu_page *sp; - write_lock(&vcpu->kvm->mmu_lock); - - if (make_mmu_pages_available(vcpu)) { - write_unlock(&vcpu->kvm->mmu_lock); - return INVALID_PAGE; - } sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); ++sp->root_count; - write_unlock(&vcpu->kvm->mmu_lock); return __pa(sp->spt); } static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { - u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u8 shadow_root_level = mmu->shadow_root_level; hpa_t root; unsigned i; if (is_tdp_mmu_enabled(vcpu->kvm)) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); - - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + mmu->root_hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, - true); - - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + mmu->root_hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { + if (WARN_ON_ONCE(!mmu->pae_root)) + return -EIO; + for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); + WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, true); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; + mmu->pae_root[i] = root | PT_PRESENT_MASK | + shadow_me_mask; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); - } else - BUG(); + mmu->root_hpa = __pa(mmu->pae_root); + } else { + WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); + return -EIO; + } /* root_pgd is ignored for direct MMUs. */ - vcpu->arch.mmu->root_pgd = 0; + mmu->root_pgd = 0; return 0; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { - u64 pdptr, pm_mask; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; hpa_t root; int i; - root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); + root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; + if (mmu->root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + pdptrs[i] = mmu->get_pdptr(vcpu, i); + if (!(pdptrs[i] & PT_PRESENT_MASK)) + continue; + + if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) + return 1; + } + } + /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa)); - + if (mmu->root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - vcpu->arch.mmu->shadow_root_level, false); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + mmu->shadow_root_level, false); + mmu->root_hpa = root; goto set_root_pgd; } + if (WARN_ON_ONCE(!mmu->pae_root)) + return -EIO; + /* * We shadow a 32 bit page table. This may be a legacy 2-level * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table. */ - pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) + pm_mask = PT_PRESENT_MASK | shadow_me_mask; + if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + if (WARN_ON_ONCE(!mmu->lm_root)) + return -EIO; + + mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask; + } + for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); - if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { - pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); - if (!(pdptr & PT_PRESENT_MASK)) { - vcpu->arch.mmu->pae_root[i] = 0; + WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); + + if (mmu->root_level == PT32E_ROOT_LEVEL) { + if (!(pdptrs[i] & PT_PRESENT_MASK)) { + mmu->pae_root[i] = INVALID_PAE_ROOT; continue; } - root_gfn = pdptr >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; + root_gfn = pdptrs[i] >> PAGE_SHIFT; } root = mmu_alloc_root(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, false); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->pae_root[i] = root | pm_mask; + mmu->pae_root[i] = root | pm_mask; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); + + if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + mmu->root_hpa = __pa(mmu->lm_root); + else + mmu->root_hpa = __pa(mmu->pae_root); + +set_root_pgd: + mmu->root_pgd = root_pgd; + + return 0; +} + +static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 *lm_root, *pae_root; /* - * If we shadow a 32 bit page table with a long mode page - * table we enter this path. + * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP + * tables are allocated and initialized at root creation as there is no + * equivalent level in the guest's NPT to shadow. Allocate the tables + * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. */ - if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { - if (vcpu->arch.mmu->lm_root == NULL) { - /* - * The additional page necessary for this is only - * allocated on demand. - */ + if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL || + mmu->shadow_root_level < PT64_ROOT_4LEVEL) + return 0; - u64 *lm_root; + /* + * This mess only works with 4-level paging and needs to be updated to + * work with 5-level paging. + */ + if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) + return -EIO; - lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (lm_root == NULL) - return 1; + if (mmu->pae_root && mmu->lm_root) + return 0; - lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; + /* + * The special roots should always be allocated in concert. Yell and + * bail if KVM ends up in a state where only one of the roots is valid. + */ + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root)) + return -EIO; - vcpu->arch.mmu->lm_root = lm_root; - } + /* + * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and + * doesn't need to be decrypted. + */ + pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pae_root) + return -ENOMEM; - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); + lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!lm_root) { + free_page((unsigned long)pae_root); + return -ENOMEM; } -set_root_pgd: - vcpu->arch.mmu->root_pgd = root_pgd; + mmu->pae_root = pae_root; + mmu->lm_root = lm_root; return 0; } -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mmu->direct_map) - return mmu_alloc_direct_roots(vcpu); - else - return mmu_alloc_shadow_roots(vcpu); -} - void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; @@ -3422,7 +3456,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; - if (root && VALID_PAGE(root)) { + if (IS_VALID_PAE_ROOT(root)) { root &= PT64_BASE_ADDR_MASK; sp = to_shadow_page(root); mmu_sync_children(vcpu, sp); @@ -3554,11 +3588,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) __is_rsvd_bits_set(rsvd_check, sptes[level], level); if (reserved) { - pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) - pr_err("------ spte 0x%llx level %d.\n", - sptes[level], level); + pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", + sptes[level], level, + rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]); } return reserved; @@ -3653,6 +3688,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; + /* + * Retry the page fault if the gfn hit a memslot that is being deleted + * or moved. This ensures any existing SPTEs for the old memslot will + * be zapped before KVM inserts a new MMIO SPTE for the gfn. + */ + if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) + return true; + /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { *pfn = KVM_PFN_NOSLOT; @@ -4615,12 +4658,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, struct kvm_mmu *context = &vcpu->arch.guest_mmu; union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); - context->shadow_root_level = new_role.base.level; - __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); - if (new_role.as_u64 != context->mmu_role.as_u64) + if (new_role.as_u64 != context->mmu_role.as_u64) { shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); + + /* + * Override the level set by the common init helper, nested TDP + * always uses the host's TDP configuration. + */ + context->shadow_root_level = new_role.base.level; + } } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); @@ -4802,16 +4850,27 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); if (r) goto out; - r = mmu_alloc_roots(vcpu); - kvm_mmu_sync_roots(vcpu); + r = mmu_alloc_special_roots(vcpu); if (r) goto out; + write_lock(&vcpu->kvm->mmu_lock); + if (make_mmu_pages_available(vcpu)) + r = -ENOSPC; + else if (vcpu->arch.mmu->direct_map) + r = mmu_alloc_direct_roots(vcpu); + else + r = mmu_alloc_shadow_roots(vcpu); + write_unlock(&vcpu->kvm->mmu_lock); + if (r) + goto out; + + kvm_mmu_sync_roots(vcpu); + kvm_mmu_load_pgd(vcpu); static_call(kvm_x86_tlb_flush_current)(vcpu); out: return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -4820,7 +4879,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); } -EXPORT_SYMBOL_GPL(kvm_mmu_unload); static bool need_remote_flush(u64 old, u64 new) { @@ -5220,6 +5278,8 @@ slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, static void free_mmu_pages(struct kvm_mmu *mmu) { + if (!tdp_enabled && mmu->pae_root) + set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->lm_root); } @@ -5240,9 +5300,11 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * while the PDP table is a per-vCPU construct that's allocated at MMU * creation. When emulating 32-bit mode, cr3 is only 32 bits even on * x86_64. Therefore we need to allocate the PDP table in the first - * 4GB of memory, which happens to fit the DMA32 zone. Except for - * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can - * skip allocating the PDP table. + * 4GB of memory, which happens to fit the DMA32 zone. TDP paging + * generally doesn't use PAE paging and can skip allocating the PDP + * table. The main exception, handled here, is SVM's 32-bit NPT. The + * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit + * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). */ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; @@ -5252,8 +5314,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) return -ENOMEM; mmu->pae_root = page_address(page); + + /* + * CR3 is only 32 bits when PAE paging is used, thus it's impossible to + * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so + * that KVM's writes and the CPU's reads get along. Note, this is + * only necessary when using shadow paging, as 64-bit NPT can get at + * the C-bit even when shadowing 32-bit NPT, and SME isn't supported + * by 32-bit kernels (when KVM itself uses 32-bit NPT). + */ + if (!tdp_enabled) + set_memory_decrypted((unsigned long)mmu->pae_root, 1); + else + WARN_ON_ONCE(shadow_me_mask); + for (i = 0; i < 4; ++i) - mmu->pae_root[i] = INVALID_PAGE; + mmu->pae_root[i] = INVALID_PAE_ROOT; return 0; } @@ -5476,9 +5552,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * spte from present to present (changing the spte from present * to nonpresent will flush all the TLBs immediately), in other * words, the only case we care is mmu_spte_update() where we - * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE - * instead of PT_WRITABLE_MASK, that means it does not depend - * on PT_WRITABLE_MASK anymore. + * have checked Host-writable | MMU-writable instead of + * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK + * anymore. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5701,25 +5777,6 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - - /* - * Set a reserved PA bit in MMIO SPTEs to generate page faults with - * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT - * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports - * 52-bit physical addresses then there are no reserved PA bits in the - * PTEs and so the reserved PA approach must be disabled. - */ - if (shadow_phys_bits < 52) - mask = BIT_ULL(51) | PT_PRESENT_MASK; - else - mask = 0; - - kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK); -} - static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ @@ -5785,8 +5842,6 @@ int kvm_mmu_module_init(void) kvm_mmu_reset_all_pte_masks(); - kvm_set_mmio_spte_mask(); - pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); @@ -5884,6 +5939,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); + bool flush = false; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); @@ -5905,19 +5961,19 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); if (is_tdp_mmu_page(sp)) { - kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, - sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + flush = kvm_tdp_mmu_zap_sp(kvm, sp); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); } if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; } } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index ced15fd58fde..cedc17b2f60e 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -70,7 +70,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; - if (root && VALID_PAGE(root)) { + if (IS_VALID_PAE_ROOT(root)) { root &= PT64_BASE_ADDR_MASK; sp = to_shadow_page(root); __mmu_spte_walk(vcpu, sp, fn, 2); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ec4fc28b325a..e03267e93459 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -20,6 +20,16 @@ extern bool dbg; #define MMU_WARN_ON(x) do { } while (0) #endif +/* + * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT + * bit, and thus are guaranteed to be non-zero when valid. And, when a guest + * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE, + * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use + * '0' instead of INVALID_PAGE to indicate an invalid PAE root. + */ +#define INVALID_PAE_ROOT 0 +#define IS_VALID_PAE_ROOT(x) (!!(x)) + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -78,6 +88,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) { /* diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 55d7b473ac44..70b7e44e3035 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -503,6 +503,7 @@ error: #endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; + walker->fault.async_page_fault = false; trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; @@ -1084,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; - host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; + host_writable = sp->spt[i] & shadow_host_writable_mask; set_spte_ret |= set_spte(vcpu, &sp->spt[i], pte_access, PG_LEVEL_4K, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index ef55f0bc4ccf..66d43cec0c31 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -16,13 +16,20 @@ #include "spte.h" #include <asm/e820/api.h> +#include <asm/vmx.h> +static bool __read_mostly enable_mmio_caching = true; +module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); + +u64 __read_mostly shadow_host_writable_mask; +u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ u64 __read_mostly shadow_user_mask; u64 __read_mostly shadow_accessed_mask; u64 __read_mostly shadow_dirty_mask; u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; u64 __read_mostly shadow_me_mask; @@ -38,7 +45,6 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 mask; WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; @@ -48,16 +54,18 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); + u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; + WARN_ON_ONCE(!shadow_mmio_value); + access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + spte |= shadow_mmio_value | access; + spte |= gpa | shadow_nonpresent_or_rsvd_mask; + spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; - return mask; + return spte; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) @@ -86,13 +94,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, bool can_unsync, bool host_writable, bool ad_disabled, u64 *new_spte) { - u64 spte = 0; + u64 spte = SPTE_MMU_PRESENT_MASK; int ret = 0; if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; + spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; + + /* + * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set + * if PAE paging may be employed (shadow paging or any 32-bit KVM). + */ + WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) && + (spte & SPTE_TDP_AD_MASK)); /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -124,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, kvm_is_mmio_pfn(pfn)); if (host_writable) - spte |= SPTE_HOST_WRITEABLE; + spte |= shadow_host_writable_mask; else pte_access &= ~ACC_WRITE_MASK; @@ -134,7 +149,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; /* * Optimization: for pte sync, if spte was writable the hash @@ -150,7 +165,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, __func__, gfn); ret |= SET_SPTE_WRITE_PROTECTED_PT; pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } } @@ -161,19 +176,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte = mark_spte_for_access_track(spte); out: + WARN_ON(is_mmio_spte(spte)); *new_spte = spte; return ret; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { - u64 spte; + u64 spte = SPTE_MMU_PRESENT_MASK; - spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; + spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -188,7 +204,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_host_writable_mask; new_spte = mark_spte_for_access_track(new_spte); @@ -242,53 +258,68 @@ u64 mark_spte_for_access_track(u64 spte) return spte; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + + if (!enable_mmio_caching) + mmio_value = 0; + + /* + * Disable MMIO caching if the MMIO value collides with the bits that + * are used to hold the relocated GFN when the L1TF mitigation is + * enabled. This should never fire as there is no known hardware that + * can trigger this condition, e.g. SME/SEV CPUs that require a custom + * MMIO value are not susceptible to L1TF. + */ + if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << + SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) + mmio_value = 0; + + /* + * The masked MMIO value must obviously match itself and a removed SPTE + * must not get a false positive. Removed SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and removed SPTEs must + * not set any RWX bits. + */ + if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || + WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + mmio_value = 0; + + shadow_mmio_value = mmio_value; + shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -/* - * Sets the shadow PTE masks used by the MMU. - * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; + shadow_user_mask = VMX_EPT_READABLE_MASK; + shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; + shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_nx_mask = 0ull; + shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; + shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; + shadow_me_mask = 0ull; + + shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; + shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; + + /* + * EPT Misconfigurations are generated if the value of bits 2:0 + * of an EPT paging-structure entry is 110b (write/execute). + */ + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, + VMX_EPT_RWX_MASK, 0); } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; + u64 mask; shadow_phys_bits = kvm_get_shadow_phys_bits(); @@ -315,4 +346,30 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); + + shadow_user_mask = PT_USER_MASK; + shadow_accessed_mask = PT_ACCESSED_MASK; + shadow_dirty_mask = PT_DIRTY_MASK; + shadow_nx_mask = PT64_NX_MASK; + shadow_x_mask = 0; + shadow_present_mask = PT_PRESENT_MASK; + shadow_acc_track_mask = 0; + shadow_me_mask = sme_me_mask; + + shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE; + shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE; + + /* + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. + */ + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 6de3950fd704..bca0ba11cccf 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -5,18 +5,33 @@ #include "mmu_internal.h" -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 +/* + * A MMU present SPTE is backed by actual memory and may or may not be present + * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it + * is ignored by all flavors of SPTEs and checking a low bit often generates + * better code than for a high bit, e.g. 56+. MMU present checks are pervasive + * enough that the improved code generation is noticeable in KVM's footprint. + */ +#define SPTE_MMU_PRESENT_MASK BIT_ULL(11) /* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. + * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also + * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. + * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that + * is must be employed for a given TDP SPTE. + * + * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE + * paging, including NPT PAE. This scheme works because legacy shadow paging + * is guaranteed to have A/D bits and write-protection is forced only for + * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it + * must be restricted to 64-bit KVM. */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) +#define SPTE_TDP_AD_SHIFT 52 +#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT) +static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) @@ -51,16 +66,46 @@ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. This mask obviously + * must not overlap the A/D type mask. + */ +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ + PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 +#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) +static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); + +/* + * Low ignored bits are at a premium for EPT, use high ignored bits, taking care + * to not overlap the A/D type mask or the saved access bits of access-tracked + * SPTEs when A/D bits are disabled. + */ +#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) +#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); +static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); +static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); +static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); + +/* Defined only to keep the above static asserts readable. */ +#undef SHADOW_ACC_TRACK_SAVED_MASK /* - * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62 + * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 + * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 * * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in * the MMIO generation number, as doing so would require stealing a bit from @@ -71,39 +116,44 @@ */ #define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_END 10 -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_START 52 #define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ MMIO_SPTE_GEN_LOW_START) #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) +static_assert(!(SPTE_MMU_PRESENT_MASK & + (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) /* remember to adjust the comment above as well if you change these */ -static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9); +static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) +extern u64 __read_mostly shadow_host_writable_mask; +extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ extern u64 __read_mostly shadow_user_mask; extern u64 __read_mostly shadow_accessed_mask; extern u64 __read_mostly shadow_dirty_mask; extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; extern u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ @@ -121,28 +171,21 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 /* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. - */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) -#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT - -/* * If a thread running without exclusive control of the MMU lock must perform a * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * - * This constant works because it is considered non-present on both AMD and - * Intel CPUs and does not create a L1TF vulnerability because the pfn section - * is zeroed out. + * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on + * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF + * vulnerability. Use only low bits to avoid 64-bit immediates. * * Only used by the TDP MMU. */ -#define REMOVED_SPTE (1ull << 59) +#define REMOVED_SPTE 0x5a0ULL + +/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ +static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); static inline bool is_removed_spte(u64 spte) { @@ -167,7 +210,13 @@ extern u8 __read_mostly shadow_phys_bits; static inline bool is_mmio_spte(u64 spte) { - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; + return (spte & shadow_mmio_mask) == shadow_mmio_value && + likely(shadow_mmio_value); +} + +static inline bool is_shadow_present_pte(u64 pte) +{ + return !!(pte & SPTE_MMU_PRESENT_MASK); } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) @@ -177,25 +226,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; + MMU_WARN_ON(!is_shadow_present_pte(spte)); + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; } static inline bool spte_ad_need_write_protect(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; + MMU_WARN_ON(!is_shadow_present_pte(spte)); + /* + * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', + * and non-TDP SPTEs will never set these bits. Optimize for 64-bit + * TDP and do the A/D type check unconditionally. + */ + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -204,11 +258,6 @@ static inline bool is_access_track_spte(u64 spte) return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } -static inline bool is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte); -} - static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; @@ -246,8 +295,8 @@ static inline bool is_dirty_spte(u64 spte) static inline bool spte_can_locklessly_be_made_writable(u64 spte) { - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); + return (spte & shadow_host_writable_mask) && + (spte & shadow_mmu_writable_mask); } static inline u64 get_mmio_spte_generation(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index e5f148106e20..b3ed302c1a35 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -21,6 +21,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) } /* + * Return the TDP iterator to the root PT and allow it to continue its + * traversal over the paging structure from there. + */ +void tdp_iter_restart(struct tdp_iter *iter) +{ + iter->yielded_gfn = iter->next_last_level_gfn; + iter->level = iter->root_level; + + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ @@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; - iter->yielded_gfn = iter->next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->level = root_level; - iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; - - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); - tdp_iter_refresh_sptep(iter); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; + iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); - iter->valid = true; + tdp_iter_restart(iter); } /* @@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) -{ - return iter->pt_path[iter->root_level - 1]; -} - diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 4cc177d75c4a..b1748b988d3a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -36,6 +36,8 @@ struct tdp_iter { int min_level; /* The iterator's current level within the paging structure */ int level; + /* The address space ID, i.e. SMM vs. regular. */ + int as_id; /* A snapshot of the value at sptep */ u64 old_spte; /* @@ -62,6 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); +void tdp_iter_restart(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d78915019b08..fd5000863678 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield); + gfn_t start, gfn_t end, bool can_yield, bool flush); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { @@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) list_del(&root->link); - zap_gfn_range(kvm, root, 0, max_gfn, false); + zap_gfn_range(kvm, root, 0, max_gfn, false, false); free_page((unsigned long)root->spt); kmem_cache_free(mmu_page_header_cache, root); @@ -137,22 +137,21 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, return sp; } -static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { union kvm_mmu_page_role role; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; - role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + lockdep_assert_held_write(&kvm->mmu_lock); - write_lock(&kvm->mmu_lock); + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); /* Check for an existing root before allocating a new one. */ for_each_tdp_mmu_root(kvm, root) { if (root->role.word == role.word) { kvm_mmu_get_root(kvm, root); - write_unlock(&kvm->mmu_lock); - return root; + goto out; } } @@ -161,19 +160,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) list_add(&root->link, &kvm->arch.tdp_mmu_roots); - write_unlock(&kvm->mmu_lock); - - return root; -} - -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *root; - - root = get_tdp_mmu_vcpu_root(vcpu); - if (!root) - return INVALID_PAGE; - +out: return __pa(root->spt); } @@ -203,20 +190,14 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) -{ - return sp->role.smm ? 1 : 0; -} - static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { - bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) return; if (is_accessed_spte(old_spte) && - (!is_accessed_spte(new_spte) || pfn_changed)) + (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } @@ -301,11 +282,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, * * Given a page table that has been removed from the TDP paging structure, * iterates through the page table to clear SPTEs and free child page tables. + * + * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU + * protection. Since this thread removed it from the paging structure, + * this thread will be responsible for ensuring the page is freed. Hence the + * early rcu_dereferences in the function. */ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, +static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, bool shared) { - struct kvm_mmu_page *sp = sptep_to_sp(pt); + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; u64 old_child_spte; @@ -318,7 +304,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = pt + i; + sptep = rcu_dereference(pt) + i; gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); if (shared) { @@ -455,7 +441,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_leaf && is_dirty_spte(old_spte) && - (!is_dirty_spte(new_spte) || pfn_changed)) + (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); /* @@ -492,25 +478,21 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - u64 *root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_read(&kvm->mmu_lock); /* * Do not change removed SPTEs. Only the thread that froze the SPTE * may modify it. */ - if (iter->old_spte == REMOVED_SPTE) + if (is_removed_spte(iter->old_spte)) return false; if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; - handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, true); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return true; } @@ -538,7 +520,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * here since the SPTE is going from non-present * to non-present. */ - WRITE_ONCE(*iter->sptep, 0); + WRITE_ONCE(*rcu_dereference(iter->sptep), 0); return true; } @@ -564,10 +546,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - tdp_ptep_t root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -577,17 +555,17 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, * should be used. If operating under the MMU lock in write mode, the * use of the removed SPTE should not be necessary. */ - WARN_ON(iter->old_spte == REMOVED_SPTE); + WARN_ON(is_removed_spte(iter->old_spte)); WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); - __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, false); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level); } @@ -659,9 +637,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], - iter->root_level, iter->min_level, - iter->next_last_level_gfn); + tdp_iter_restart(iter); return true; } @@ -678,20 +654,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. + * operation can cause a soft lockup. Note, in some use cases a flush may be + * required by prior actions. Ensure the pending flush is performed prior to + * yielding. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield) + gfn_t start, gfn_t end, bool can_yield, bool flush) { struct tdp_iter iter; - bool flush_needed = false; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { - flush_needed = false; + tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + flush = false; continue; } @@ -709,11 +686,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_set_spte(kvm, &iter, 0); - flush_needed = true; + flush = true; } rcu_read_unlock(); - return flush_needed; + return flush; } /* @@ -722,13 +699,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. */ -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield) { struct kvm_mmu_page *root; bool flush = false; for_each_tdp_mmu_root_yield_safe(kvm, root) - flush |= zap_gfn_range(kvm, root, start, end, true); + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); return flush; } @@ -785,12 +763,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, new_spte); ret = RET_PF_EMULATE; - } else + } else { trace_kvm_mmu_set_spte(iter->level, iter->gfn, rcu_dereference(iter->sptep)); + } - trace_kvm_mmu_set_spte(iter->level, iter->gfn, - rcu_dereference(iter->sptep)); if (!prefault) vcpu->stat.pf_fixed++; @@ -890,17 +867,15 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return ret; } -static __always_inline int -kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, - gfn_t start, - gfn_t end, - unsigned long data)) +typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, gfn_t end, + unsigned long data); + +static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + tdp_handler_t handler) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -935,12 +910,20 @@ kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, return ret; } +static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm, + unsigned long addr, + unsigned long data, + tdp_handler_t handler) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler); +} + static int zap_gfn_range_hva_wrapper(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_mmu_page *root, gfn_t start, gfn_t end, unsigned long unused) { - return zap_gfn_range(kvm, root, start, end, false); + return zap_gfn_range(kvm, root, start, end, false, false); } int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, @@ -1008,12 +991,12 @@ int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, } static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, - unsigned long unused2) + struct kvm_mmu_page *root, gfn_t gfn, gfn_t end, + unsigned long unused) { struct tdp_iter iter; - tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) + tdp_root_for_each_leaf_pte(iter, root, gfn, end) if (is_accessed_spte(iter.old_spte)) return 1; @@ -1022,8 +1005,7 @@ static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, - test_age_gfn); + return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn); } /* @@ -1033,7 +1015,7 @@ int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) * Returns non-zero if a flush is needed before releasing the MMU lock. */ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t end, unsigned long data) { struct tdp_iter iter; @@ -1044,7 +1026,7 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, rcu_read_lock(); - WARN_ON(pte_huge(*ptep)); + WARN_ON(pte_huge(*ptep) || (gfn + 1) != end); new_pfn = pte_pfn(*ptep); @@ -1055,10 +1037,14 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, if (!is_shadow_present_pte(iter.old_spte)) break; + /* + * Note, when changing a read-only SPTE, it's not strictly + * necessary to zero the SPTE before setting the new PFN, but + * doing so preserves the invariant that the PFN of a present + * leaf SPTE can never change. See __handle_changed_spte(). + */ tdp_mmu_set_spte(kvm, &iter, 0); - kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); - if (!pte_write(*ptep)) { new_spte = kvm_mmu_changed_pte_notifier_make_spte( iter.old_spte, new_pfn); @@ -1080,9 +1066,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, pte_t *host_ptep) { - return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, - (unsigned long)host_ptep, - set_tdp_spte); + return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep, + set_tdp_spte); } /* @@ -1342,7 +1327,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, /* * Removes write access on the last level SPTE mapping this GFN and unsets the - * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, @@ -1359,7 +1344,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, break; new_spte = iter.old_spte & - ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); tdp_mmu_set_spte(kvm, &iter, new_spte); spte_set = true; @@ -1372,7 +1357,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, /* * Removes write access on the last level SPTE mapping this GFN and unsets the - * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3b761c111bff..31096ece9b14 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -8,7 +8,29 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, + gfn_t end) +{ + return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); +} +static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + + /* + * Don't allow yielding, as the caller may have a flush pending. Note, + * if mmu_lock is held for write, zapping will never yield in this case, + * but explicitly disallow it for safety. The TDP MMU does not yield + * until it has made forward progress (steps sideways), and when zapping + * a single shadow page that it's guaranteed to see (thus the mmu_lock + * requirement), its "step sideways" will always step beyond the bounds + * of the shadow page's gfn range and stop iterating before yielding. + */ + lockdep_assert_held_write(&kvm->mmu_lock); + return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); +} void kvm_tdp_mmu_zap_all(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 78bdcfac4e40..cd0285f15a68 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -270,7 +270,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) return -EINVAL; - if (!svm->vcpu.arch.apic->regs) + if (!vcpu->arch.apic->regs) return -EINVAL; if (kvm_apicv_activated(vcpu->kvm)) { @@ -281,7 +281,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return ret; } - svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); + svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); /* Setting AVIC backing page address in the phy APIC ID table */ entry = avic_get_physical_id_entry(vcpu, id); @@ -315,15 +315,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, } } -int avic_incomplete_ipi_interception(struct vcpu_svm *svm) +int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; u32 index = svm->vmcb->control.exit_info_2 & 0xFF; - struct kvm_lapic *apic = svm->vcpu.arch.apic; + struct kvm_lapic *apic = vcpu->arch.apic; - trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); + trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); switch (id) { case AVIC_IPI_FAILURE_INVALID_INT_TYPE: @@ -347,11 +348,11 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm) * set the appropriate IRR bits on the valid target * vcpus. So, we just need to kick the appropriate vcpu. */ - avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh); + avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh); break; case AVIC_IPI_FAILURE_INVALID_TARGET: WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", - index, svm->vcpu.vcpu_id, icrh, icrl); + index, vcpu->vcpu_id, icrh, icrl); break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); @@ -539,8 +540,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset) return ret; } -int avic_unaccelerated_access_interception(struct vcpu_svm *svm) +int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int ret = 0; u32 offset = svm->vmcb->control.exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK; @@ -550,7 +552,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm) AVIC_UNACCEL_ACCESS_WRITE_MASK; bool trap = is_avic_unaccelerated_access_trap(offset); - trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, + trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset, trap, write, vector); if (trap) { /* Handling Trap */ @@ -558,7 +560,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = kvm_emulate_instruction(&svm->vcpu, 0); + ret = kvm_emulate_instruction(vcpu, 0); } return ret; @@ -572,7 +574,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) if (!avic || !irqchip_in_kernel(vcpu->kvm)) return 0; - ret = avic_init_backing_page(&svm->vcpu); + ret = avic_init_backing_page(vcpu); if (ret) return ret; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 35891d9a1099..9bed4847ad55 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -29,6 +29,8 @@ #include "lapic.h" #include "svm.h" +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK + static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *hsave = svm->nested.hsave; WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer, + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4, + svm->vmcb01.ptr->save.efer, svm->nested.ctl.nested_cr3); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; @@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm) return; c = &svm->vmcb->control; - h = &svm->nested.hsave->control; + h = &svm->vmcb01.ptr->control; g = &svm->nested.ctl; for (i = 0; i < MAX_INTERCEPT; i++) @@ -233,49 +235,71 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) static bool nested_vmcb_check_controls(struct vmcb_control_area *control) { - if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0) + if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) return false; - if (control->asid == 0) + if (CC(control->asid == 0)) return false; - if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && - !npt_enabled) + if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled)) return false; return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) +static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, + struct vmcb_save_area *save) { - struct kvm_vcpu *vcpu = &svm->vcpu; - bool vmcb12_lma; + /* + * These checks are also performed by KVM_SET_SREGS, + * except that EFER.LMA is not checked by SVM against + * CR0.PG && EFER.LME. + */ + if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { + if (CC(!(save->cr4 & X86_CR4_PAE)) || + CC(!(save->cr0 & X86_CR0_PE)) || + CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) + return false; + } - if ((vmcb12->save.efer & EFER_SVME) == 0) + if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) return false; - if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW)) + return true; +} + +/* Common checks that apply to both L1 and L2 state. */ +static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, + struct vmcb_save_area *save) +{ + /* + * FIXME: these should be done after copying the fields, + * to avoid TOC/TOU races. For these save area checks + * the possible damage is limited since kvm_set_cr0 and + * kvm_set_cr4 handle failure; EFER_SVME is an exception + * so it is force-set later in nested_prepare_vmcb_save. + */ + if (CC(!(save->efer & EFER_SVME))) return false; - if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7)) + if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || + CC(save->cr0 & ~0xffffffffULL)) return false; - vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); + if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) + return false; - if (vmcb12_lma) { - if (!(vmcb12->save.cr4 & X86_CR4_PAE) || - !(vmcb12->save.cr0 & X86_CR0_PE) || - kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3)) - return false; - } - if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) + if (!nested_vmcb_check_cr3_cr4(vcpu, save)) + return false; + + if (CC(!kvm_valid_efer(vcpu, save->efer))) return false; - return nested_vmcb_check_controls(&vmcb12->control); + return true; } -static void load_nested_vmcb_control(struct vcpu_svm *svm, - struct vmcb_control_area *control) +static void nested_load_control_from_vmcb12(struct vcpu_svm *svm, + struct vmcb_control_area *control) { copy_vmcb_control_area(&svm->nested.ctl, control); @@ -287,9 +311,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm, /* * Synchronize fields that are written by the processor, so that - * they can be copied back into the nested_vmcb. + * they can be copied back into the vmcb12. */ -void sync_nested_vmcb_control(struct vcpu_svm *svm) +void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) { u32 mask; svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; @@ -317,8 +341,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) * Transfer any event that L0 or L1 wanted to inject into L2 to * EXIT_INT_INFO. */ -static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, - struct vmcb *vmcb12) +static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, + struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 exit_int_info = 0; @@ -362,12 +386,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) { - if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) return -EINVAL; if (!nested_npt && is_pae_paging(vcpu) && (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { - if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) return -EINVAL; } @@ -386,20 +410,57 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) { + if (!svm->nested.vmcb02.ptr) + return; + + /* FIXME: merge g_pat from vmcb01 and vmcb12. */ + svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; +} + +static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +{ + bool new_vmcb12 = false; + + nested_vmcb02_compute_g_pat(svm); + /* Load the nested guest state */ - svm->vmcb->save.es = vmcb12->save.es; - svm->vmcb->save.cs = vmcb12->save.cs; - svm->vmcb->save.ss = vmcb12->save.ss; - svm->vmcb->save.ds = vmcb12->save.ds; - svm->vmcb->save.gdtr = vmcb12->save.gdtr; - svm->vmcb->save.idtr = vmcb12->save.idtr; + + if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { + new_vmcb12 = true; + svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa; + } + + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { + svm->vmcb->save.es = vmcb12->save.es; + svm->vmcb->save.cs = vmcb12->save.cs; + svm->vmcb->save.ss = vmcb12->save.ss; + svm->vmcb->save.ds = vmcb12->save.ds; + svm->vmcb->save.cpl = vmcb12->save.cpl; + vmcb_mark_dirty(svm->vmcb, VMCB_SEG); + } + + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { + svm->vmcb->save.gdtr = vmcb12->save.gdtr; + svm->vmcb->save.idtr = vmcb12->save.idtr; + vmcb_mark_dirty(svm->vmcb, VMCB_DT); + } + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, vmcb12->save.efer); + + /* + * Force-set EFER_SVME even though it is checked earlier on the + * VMCB12, because the guest can flip the bit between the check + * and now. Clearing EFER_SVME would call svm_free_nested. + */ + svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; + + svm->vcpu.arch.cr2 = vmcb12->save.cr2; + kvm_rax_write(&svm->vcpu, vmcb12->save.rax); kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); kvm_rip_write(&svm->vcpu, vmcb12->save.rip); @@ -408,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.rax = vmcb12->save.rax; svm->vmcb->save.rsp = vmcb12->save.rsp; svm->vmcb->save.rip = vmcb12->save.rip; - svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; - svm->vmcb->save.cpl = vmcb12->save.cpl; + + /* These bits will be set properly on the first execution when new_vmc12 is true */ + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { + svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; + vmcb_mark_dirty(svm->vmcb, VMCB_DR); + } } -static void nested_prepare_vmcb_control(struct vcpu_svm *svm) +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + /* + * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, + * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. + */ + + /* + * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, + * avic_physical_id. + */ + WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK); + + /* Copied from vmcb01. msrpm_base can be overwritten later. */ + svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; + svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa; + svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa; + + /* Done at vmrun: asid. */ + + /* Also overwritten later if necessary. */ + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + + /* nested_cr3. */ if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(&svm->vcpu); @@ -425,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & ~mask) | - (svm->nested.hsave->control.int_ctl & mask); + (svm->vmcb01.ptr->control.int_ctl & mask); svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; @@ -440,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) enter_guest_mode(&svm->vcpu); /* - * Merge guest and host intercepts - must be called with vcpu in - * guest-mode to take affect here + * Merge guest and host intercepts - must be called with vcpu in + * guest-mode to take effect. */ recalc_intercepts(svm); +} - vmcb_mark_all_dirty(svm->vmcb); +static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +{ + /* + * Some VMCB state is shared between L1 and L2 and thus has to be + * moved at the time of nested vmrun and vmexit. + * + * VMLOAD/VMSAVE state would also belong in this category, but KVM + * always performs VMLOAD and VMSAVE from the VMCB01. + */ + to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl; } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, struct vmcb *vmcb12) { + struct vcpu_svm *svm = to_svm(vcpu); int ret; trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, @@ -468,9 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, svm->nested.vmcb12_gpa = vmcb12_gpa; - load_nested_vmcb_control(svm, &vmcb12->control); - nested_prepare_vmcb_control(svm); - nested_prepare_vmcb_save(svm, vmcb12); + + WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr); + + nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); @@ -478,44 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, return ret; if (!npt_enabled) - svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested; + vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; svm_set_gif(svm, true); return 0; } -int nested_svm_vmrun(struct vcpu_svm *svm) +int nested_svm_vmrun(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int ret; struct vmcb *vmcb12; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; u64 vmcb12_gpa; - if (is_smm(&svm->vcpu)) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + ++vcpu->stat.nested_run; + + if (is_smm(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } vmcb12_gpa = svm->vmcb->save.rax; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map); + ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { - kvm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } else if (ret) { - return kvm_skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(vcpu); } - ret = kvm_skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(vcpu); vmcb12 = map.hva; if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - if (!nested_vmcb_checks(svm, vmcb12)) { + nested_load_control_from_vmcb12(svm, &vmcb12->control); + + if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || + !nested_vmcb_check_controls(&svm->nested.ctl)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -525,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm) /* Clear internal status */ - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs + * Since vmcb01 is not in use, we can use it to store some of the L1 + * state. */ - hsave->save.es = vmcb->save.es; - hsave->save.cs = vmcb->save.cs; - hsave->save.ss = vmcb->save.ss; - hsave->save.ds = vmcb->save.ds; - hsave->save.gdtr = vmcb->save.gdtr; - hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.efer; - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = kvm_get_rflags(&svm->vcpu); - hsave->save.rip = kvm_rip_read(&svm->vcpu); - hsave->save.rsp = vmcb->save.rsp; - hsave->save.rax = vmcb->save.rax; - if (npt_enabled) - hsave->save.cr3 = vmcb->save.cr3; - else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - - copy_vmcb_control_area(&hsave->control, &vmcb->control); + svm->vmcb01.ptr->save.efer = vcpu->arch.efer; + svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); + svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; + svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); + svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); + + if (!npt_enabled) + svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12)) + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) @@ -571,7 +667,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(&svm->vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map, true); return ret; } @@ -594,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { - int rc; + struct kvm_vcpu *vcpu = &svm->vcpu; struct vmcb *vmcb12; - struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; + int rc; + + /* Triple faults in L2 should never escape. */ + WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); + rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } vmcb12 = map.hva; /* Exit Guest-Mode */ - leave_guest_mode(&svm->vcpu); + leave_guest_mode(vcpu); svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -628,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->save.gdtr = vmcb->save.gdtr; vmcb12->save.idtr = vmcb->save.idtr; vmcb12->save.efer = svm->vcpu.arch.efer; - vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu); - vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu); + vmcb12->save.cr0 = kvm_read_cr0(vcpu); + vmcb12->save.cr3 = kvm_read_cr3(vcpu); vmcb12->save.cr2 = vmcb->save.cr2; vmcb12->save.cr4 = svm->vcpu.arch.cr4; - vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu); - vmcb12->save.rip = kvm_rip_read(&svm->vcpu); - vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu); - vmcb12->save.rax = kvm_rax_read(&svm->vcpu); + vmcb12->save.rflags = kvm_get_rflags(vcpu); + vmcb12->save.rip = kvm_rip_read(vcpu); + vmcb12->save.rsp = kvm_rsp_read(vcpu); + vmcb12->save.rax = kvm_rax_read(vcpu); vmcb12->save.dr7 = vmcb->save.dr7; vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.cpl = vmcb->save.cpl; @@ -647,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; if (vmcb12->control.exit_code != SVM_EXIT_ERR) - nested_vmcb_save_pending_event(svm, vmcb12); + nested_save_pending_event_to_vmcb12(svm, vmcb12); if (svm->nrips_enabled) vmcb12->control.next_rip = vmcb->control.next_rip; @@ -662,37 +761,38 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; - /* Restore the original control entries */ - copy_vmcb_control_area(&vmcb->control, &hsave->control); + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + + svm_switch_vmcb(svm, &svm->vmcb01); - /* On vmexit the GIF is set to false */ + /* + * On vmexit the GIF is set to false and + * no event can be injected in L1. + */ svm_set_gif(svm, false); + svm->vmcb->control.exit_int_info = 0; - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = - svm->vcpu.arch.l1_tsc_offset; + svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; + if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) { + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + } svm->nested.ctl.nested_cr3 = 0; - /* Restore selected save entries */ - svm->vmcb->save.es = hsave->save.es; - svm->vmcb->save.cs = hsave->save.cs; - svm->vmcb->save.ss = hsave->save.ss; - svm->vmcb->save.ds = hsave->save.ds; - svm->vmcb->save.gdtr = hsave->save.gdtr; - svm->vmcb->save.idtr = hsave->save.idtr; - kvm_set_rflags(&svm->vcpu, hsave->save.rflags); - kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, hsave->save.efer); - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); - svm_set_cr4(&svm->vcpu, hsave->save.cr4); - kvm_rax_write(&svm->vcpu, hsave->save.rax); - kvm_rsp_write(&svm->vcpu, hsave->save.rsp); - kvm_rip_write(&svm->vcpu, hsave->save.rip); - svm->vmcb->save.dr7 = DR7_FIXED_1; - svm->vmcb->save.cpl = 0; - svm->vmcb->control.exit_int_info = 0; + /* + * Restore processor state that had been saved in vmcb01 + */ + kvm_set_rflags(vcpu, svm->vmcb->save.rflags); + svm_set_efer(vcpu, svm->vmcb->save.efer); + svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); + svm_set_cr4(vcpu, svm->vmcb->save.cr4); + kvm_rax_write(vcpu, svm->vmcb->save.rax); + kvm_rsp_write(vcpu, svm->vmcb->save.rsp); + kvm_rip_write(vcpu, svm->vmcb->save.rip); - vmcb_mark_all_dirty(svm->vmcb); + svm->vcpu.arch.dr7 = DR7_FIXED_1; + kvm_update_dr7(&svm->vcpu); trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, vmcb12->control.exit_info_1, @@ -701,50 +801,53 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map, true); - nested_svm_uninit_mmu_context(&svm->vcpu); + nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false); + rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false); if (rc) return 1; - if (npt_enabled) - svm->vmcb->save.cr3 = hsave->save.cr3; - /* * Drop what we picked up for L2 via svm_complete_interrupts() so it * doesn't end up in L1. */ svm->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); return 0; } +static void nested_svm_triple_fault(struct kvm_vcpu *vcpu) +{ + nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN); +} + int svm_allocate_nested(struct vcpu_svm *svm) { - struct page *hsave_page; + struct page *vmcb02_page; if (svm->nested.initialized) return 0; - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!hsave_page) + vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmcb02_page) return -ENOMEM; - svm->nested.hsave = page_address(hsave_page); + svm->nested.vmcb02.ptr = page_address(vmcb02_page); + svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT); svm->nested.msrpm = svm_vcpu_alloc_msrpm(); if (!svm->nested.msrpm) - goto err_free_hsave; + goto err_free_vmcb02; svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); svm->nested.initialized = true; return 0; -err_free_hsave: - __free_page(hsave_page); +err_free_vmcb02: + __free_page(vmcb02_page); return -ENOMEM; } @@ -756,8 +859,8 @@ void svm_free_nested(struct vcpu_svm *svm) svm_vcpu_free_msrpm(svm->nested.msrpm); svm->nested.msrpm = NULL; - __free_page(virt_to_page(svm->nested.hsave)); - svm->nested.hsave = NULL; + __free_page(virt_to_page(svm->nested.vmcb02.ptr)); + svm->nested.vmcb02.ptr = NULL; svm->nested.initialized = false; } @@ -767,18 +870,19 @@ void svm_free_nested(struct vcpu_svm *svm) */ void svm_leave_nested(struct vcpu_svm *svm) { - if (is_guest_mode(&svm->vcpu)) { - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; + struct kvm_vcpu *vcpu = &svm->vcpu; + if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; - leave_guest_mode(&svm->vcpu); - copy_vmcb_control_area(&vmcb->control, &hsave->control); - nested_svm_uninit_mmu_context(&svm->vcpu); + leave_guest_mode(vcpu); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + + nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); } - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -887,16 +991,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm) return vmexit; } -int nested_svm_check_permissions(struct vcpu_svm *svm) +int nested_svm_check_permissions(struct kvm_vcpu *vcpu) { - if (!(svm->vcpu.arch.efer & EFER_SVME) || - !is_paging(&svm->vcpu)) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (svm->vmcb->save.cpl) { - kvm_inject_gp(&svm->vcpu, 0); + if (to_svm(vcpu)->vmcb->save.cpl) { + kvm_inject_gp(vcpu, 0); return 1; } @@ -944,50 +1047,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) nested_svm_vmexit(svm); } -static void nested_svm_smi(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_SMI; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - -static void nested_svm_nmi(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_NMI; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - -static void nested_svm_intr(struct vcpu_svm *svm) -{ - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - - svm->vmcb->control.exit_code = SVM_EXIT_INTR; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - static inline bool nested_exit_on_init(struct vcpu_svm *svm) { return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } -static void nested_svm_init(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_INIT; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - - static int svm_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1001,7 +1065,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_init(svm)) return 0; - nested_svm_init(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_INIT); return 0; } @@ -1019,7 +1083,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_smi(svm)) return 0; - nested_svm_smi(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_SMI); return 0; } @@ -1028,7 +1092,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_nmi(svm)) return 0; - nested_svm_nmi(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_NMI); return 0; } @@ -1037,7 +1101,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_intr(svm)) return 0; - nested_svm_intr(svm); + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + nested_svm_simple_vmexit(svm, SVM_EXIT_INTR); return 0; } @@ -1056,8 +1121,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm) case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] & - excp_bits) + if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] & + excp_bits) return NESTED_EXIT_HOST; else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && svm->vcpu.arch.apf.host_apf_flags) @@ -1121,10 +1186,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, sizeof(user_vmcb->control))) return -EFAULT; - if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save, + if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save, sizeof(user_vmcb->save))) return -EFAULT; - out: return kvm_state.size; } @@ -1134,7 +1198,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *kvm_state) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *hsave = svm->nested.hsave; struct vmcb __user *user_vmcb = (struct vmcb __user *) &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; @@ -1196,7 +1259,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, /* * Processor state contains L2 state. Check that it is - * valid for guest mode (see nested_vmcb_checks). + * valid for guest mode (see nested_vmcb_check_save). */ cr0 = kvm_read_cr0(vcpu); if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) @@ -1205,27 +1268,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, /* * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). - * TODO: validate reserved bits for all saved state. */ - if (!(save->cr0 & X86_CR0_PG)) + if (!(save->cr0 & X86_CR0_PG) || + !(save->cr0 & X86_CR0_PE) || + (save->rflags & X86_EFLAGS_VM) || + !nested_vmcb_valid_sregs(vcpu, save)) goto out_free; /* - * All checks done, we can enter guest mode. L1 control fields - * come from the nested save state. Guest state is already - * in the registers, the save area of the nested state instead - * contains saved L1 state. + * All checks done, we can enter guest mode. Userspace provides + * vmcb12.control, which will be combined with L1 and stored into + * vmcb02, and the L1 save state which we store in vmcb01. + * L2 registers if needed are moved from the current VMCB to VMCB02. */ svm->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); - copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); - hsave->save = *save; - svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; - load_nested_vmcb_control(svm, ctl); - nested_prepare_vmcb_control(svm); + if (svm->current_vmcb == &svm->vmcb01) + svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; + + svm->vmcb01.ptr->save.es = save->es; + svm->vmcb01.ptr->save.cs = save->cs; + svm->vmcb01.ptr->save.ss = save->ss; + svm->vmcb01.ptr->save.ds = save->ds; + svm->vmcb01.ptr->save.gdtr = save->gdtr; + svm->vmcb01.ptr->save.idtr = save->idtr; + svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED; + svm->vmcb01.ptr->save.efer = save->efer; + svm->vmcb01.ptr->save.cr0 = save->cr0; + svm->vmcb01.ptr->save.cr3 = save->cr3; + svm->vmcb01.ptr->save.cr4 = save->cr4; + svm->vmcb01.ptr->save.rax = save->rax; + svm->vmcb01.ptr->save.rsp = save->rsp; + svm->vmcb01.ptr->save.rip = save->rip; + svm->vmcb01.ptr->save.cpl = 0; + + nested_load_control_from_vmcb12(svm, ctl); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + + nested_vmcb02_prepare_control(svm); kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; @@ -1238,6 +1322,7 @@ out_free: struct kvm_x86_nested_ops svm_nested_ops = { .check_events = svm_check_nested_events, + .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 874ea309279f..83e00e524513 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1849,7 +1849,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn; vcpu->arch.regs[VCPU_REGS_RCX] = 0; - ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID); + ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID); if (!ret) { ret = -EINVAL; break; @@ -1899,8 +1899,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) return ret; } -int sev_handle_vmgexit(struct vcpu_svm *svm) +int sev_handle_vmgexit(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; u64 ghcb_gpa, exit_code; struct ghcb *ghcb; @@ -1912,13 +1913,13 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) return sev_handle_vmgexit_msr_protocol(svm); if (!ghcb_gpa) { - vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n"); + vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n"); return -EINVAL; } - if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { + if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { /* Unable to map GHCB from guest */ - vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", + vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", ghcb_gpa); return -EINVAL; } @@ -1926,7 +1927,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) svm->ghcb = svm->ghcb_map.hva; ghcb = svm->ghcb_map.hva; - trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb); + trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); exit_code = ghcb_get_sw_exit_code(ghcb); @@ -1944,7 +1945,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) if (!setup_vmgexit_scratch(svm, true, control->exit_info_2)) break; - ret = kvm_sev_es_mmio_read(&svm->vcpu, + ret = kvm_sev_es_mmio_read(vcpu, control->exit_info_1, control->exit_info_2, svm->ghcb_sa); @@ -1953,19 +1954,19 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) break; - ret = kvm_sev_es_mmio_write(&svm->vcpu, + ret = kvm_sev_es_mmio_write(vcpu, control->exit_info_1, control->exit_info_2, svm->ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: - ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET); + ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); break; case SVM_VMGEXIT_AP_HLT_LOOP: - ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { - struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; switch (control->exit_info_1) { case 0: @@ -1990,12 +1991,12 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) break; } case SVM_VMGEXIT_UNSUPPORTED_EVENT: - vcpu_unimpl(&svm->vcpu, + vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", control->exit_info_1, control->exit_info_2); break; default: - ret = svm_invoke_exit_handler(svm, exit_code); + ret = svm_invoke_exit_handler(vcpu, exit_code); } return ret; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 58a45bb139f8..271196400495 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -279,7 +279,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) * In this case we will return to the nested guest * as soon as we leave SMM. */ - if (!is_smm(&svm->vcpu)) + if (!is_smm(vcpu)) svm_free_nested(svm); } else { @@ -363,10 +363,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) bool has_error_code = vcpu->arch.exception.has_error_code; u32 error_code = vcpu->arch.exception.error_code; - kvm_deliver_exception_payload(&svm->vcpu); + kvm_deliver_exception_payload(vcpu); if (nr == BP_VECTOR && !nrips) { - unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); + unsigned long rip, old_rip = kvm_rip_read(vcpu); /* * For guest debugging where we have to reinject #BP if some @@ -375,8 +375,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) * raises a fault that is not intercepted. Still better than * failing in all cases. */ - (void)skip_emulated_instruction(&svm->vcpu); - rip = kvm_rip_read(&svm->vcpu); + (void)skip_emulated_instruction(vcpu); + rip = kvm_rip_read(vcpu); svm->int3_rip = rip + svm->vmcb->save.cs.base; svm->int3_injected = rip - old_rip; } @@ -881,7 +881,7 @@ static __init void svm_adjust_mmio_mask(void) */ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); } static void svm_hardware_teardown(void) @@ -1084,8 +1084,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) if (is_guest_mode(vcpu)) { /* Write L1's TSC offset. */ g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = offset; + svm->vmcb01.ptr->control.tsc_offset; + svm->vmcb01.ptr->control.tsc_offset = offset; } trace_kvm_write_tsc_offset(vcpu->vcpu_id, @@ -1113,12 +1113,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm) } } -static void init_vmcb(struct vcpu_svm *svm) +static void init_vmcb(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - svm->vcpu.arch.hflags = 0; + vcpu->arch.hflags = 0; svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); @@ -1126,7 +1127,7 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, INTERCEPT_CR0_WRITE); svm_set_intercept(svm, INTERCEPT_CR3_WRITE); svm_set_intercept(svm, INTERCEPT_CR4_WRITE); - if (!kvm_vcpu_apicv_active(&svm->vcpu)) + if (!kvm_vcpu_apicv_active(vcpu)) svm_set_intercept(svm, INTERCEPT_CR8_WRITE); set_dr_intercepts(svm); @@ -1170,12 +1171,12 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, INTERCEPT_RDPRU); svm_set_intercept(svm, INTERCEPT_RSM); - if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { + if (!kvm_mwait_in_guest(vcpu->kvm)) { svm_set_intercept(svm, INTERCEPT_MONITOR); svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(svm->vcpu.kvm)) + if (!kvm_hlt_in_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_HLT); control->iopm_base_pa = __sme_set(iopm_base); @@ -1201,19 +1202,19 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr4(&svm->vcpu, 0); - svm_set_efer(&svm->vcpu, 0); + svm_set_cr4(vcpu, 0); + svm_set_efer(vcpu, 0); save->dr6 = 0xffff0ff0; - kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); save->rip = 0x0000fff0; - svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; + vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; /* * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. * It also updates the guest-visible cr0 value. */ - svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(&svm->vcpu); + svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + kvm_mmu_reset_context(vcpu); save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -1225,17 +1226,18 @@ static void init_vmcb(struct vcpu_svm *svm) clr_exception_intercept(svm, PF_VECTOR); svm_clr_intercept(svm, INTERCEPT_CR3_READ); svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); - save->g_pat = svm->vcpu.arch.pat; + save->g_pat = vcpu->arch.pat; save->cr3 = 0; save->cr4 = 0; } - svm->asid_generation = 0; + svm->current_vmcb->asid_generation = 0; svm->asid = 0; svm->nested.vmcb12_gpa = 0; - svm->vcpu.arch.hflags = 0; + svm->nested.last_vmcb12_gpa = 0; + vcpu->arch.hflags = 0; - if (!kvm_pause_in_guest(svm->vcpu.kvm)) { + if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; if (pause_filter_thresh) control->pause_filter_thresh = pause_filter_thresh; @@ -1246,7 +1248,14 @@ static void init_vmcb(struct vcpu_svm *svm) svm_check_invpcid(svm); - if (kvm_vcpu_apicv_active(&svm->vcpu)) + /* + * If the host supports V_SPEC_CTRL then disable the interception + * of MSR_IA32_SPEC_CTRL. + */ + if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + + if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm); /* @@ -1265,11 +1274,11 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } - if (sev_guest(svm->vcpu.kvm)) { + if (sev_guest(vcpu->kvm)) { svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { /* Perform SEV-ES specific VMCB updates */ sev_es_init_vmcb(svm); } @@ -1291,12 +1300,12 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->virt_spec_ctrl = 0; if (!init_event) { - svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm); + init_vmcb(vcpu); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); kvm_rdx_write(vcpu, eax); @@ -1305,10 +1314,25 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } +void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) +{ + svm->current_vmcb = target_vmcb; + svm->vmcb = target_vmcb->ptr; + svm->vmcb_pa = target_vmcb->pa; + + /* + * Track the physical CPU the target_vmcb is running on + * in order to mark the VMCB dirty if the cpu changes at + * its next vmrun. + */ + + svm->current_vmcb->cpu = svm->vcpu.cpu; +} + static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; - struct page *vmcb_page; + struct page *vmcb01_page; struct page *vmsa_page = NULL; int err; @@ -1316,11 +1340,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); err = -ENOMEM; - vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!vmcb_page) + vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmcb01_page) goto out; - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { /* * SEV-ES guests require a separate VMSA page used to contain * the encrypted register state of the guest. @@ -1356,20 +1380,21 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->vmcb = page_address(vmcb_page); - svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); + svm->vmcb01.ptr = page_address(vmcb01_page); + svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); if (vmsa_page) svm->vmsa = page_address(vmsa_page); - svm->asid_generation = 0; svm->guest_state_loaded = false; - init_vmcb(svm); + + svm_switch_vmcb(svm, &svm->vmcb01); + init_vmcb(vcpu); svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; - if (sev_es_guest(svm->vcpu.kvm)) + if (sev_es_guest(vcpu->kvm)) /* Perform SEV-ES specific VMCB creation updates */ sev_es_create_vcpu(svm); @@ -1379,7 +1404,7 @@ error_free_vmsa_page: if (vmsa_page) __free_page(vmsa_page); error_free_vmcb_page: - __free_page(vmcb_page); + __free_page(vmcb01_page); out: return err; } @@ -1407,7 +1432,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) sev_free_vcpu(vcpu); - __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); + __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); } @@ -1432,7 +1457,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) * Save additional host state that will be restored on VMEXIT (sev-es) * or subsequent vmload of host save area. */ - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { sev_es_prepare_guest_switch(svm, vcpu->cpu); } else { vmsave(__sme_page_pa(sd->save_area)); @@ -1476,11 +1501,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_svm *svm = to_svm(vcpu); struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - if (unlikely(cpu != vcpu->cpu)) { - svm->asid_generation = 0; - vmcb_mark_all_dirty(svm->vmcb); - } - if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); @@ -1564,7 +1584,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm) /* Drop int_ctl fields related to VINTR injection. */ svm->vmcb->control.int_ctl &= mask; if (is_guest_mode(&svm->vcpu)) { - svm->nested.hsave->control.int_ctl &= mask; + svm->vmcb01.ptr->control.int_ctl &= mask; WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != (svm->nested.ctl.int_ctl & V_TPR_MASK)); @@ -1577,16 +1597,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm) static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) { struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; + struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save; switch (seg) { case VCPU_SREG_CS: return &save->cs; case VCPU_SREG_DS: return &save->ds; case VCPU_SREG_ES: return &save->es; - case VCPU_SREG_FS: return &save->fs; - case VCPU_SREG_GS: return &save->gs; + case VCPU_SREG_FS: return &save01->fs; + case VCPU_SREG_GS: return &save01->gs; case VCPU_SREG_SS: return &save->ss; - case VCPU_SREG_TR: return &save->tr; - case VCPU_SREG_LDTR: return &save->ldtr; + case VCPU_SREG_TR: return &save01->tr; + case VCPU_SREG_LDTR: return &save01->ldtr; } BUG(); return NULL; @@ -1709,37 +1730,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) vmcb_mark_dirty(svm->vmcb, VMCB_DT); } -static void update_cr0_intercept(struct vcpu_svm *svm) -{ - ulong gcr0; - u64 *hcr0; - - /* - * SEV-ES guests must always keep the CR intercepts cleared. CR - * tracking is done using the CR write traps. - */ - if (sev_es_guest(svm->vcpu.kvm)) - return; - - gcr0 = svm->vcpu.arch.cr0; - hcr0 = &svm->vmcb->save.cr0; - *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) - | (gcr0 & SVM_CR0_SELECTIVE_MASK); - - vmcb_mark_dirty(svm->vmcb, VMCB_CR); - - if (gcr0 == *hcr0) { - svm_clr_intercept(svm, INTERCEPT_CR0_READ); - svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); - } else { - svm_set_intercept(svm, INTERCEPT_CR0_READ); - svm_set_intercept(svm, INTERCEPT_CR0_WRITE); - } -} - void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); + u64 hcr0 = cr0; #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { @@ -1757,7 +1751,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vcpu->arch.cr0 = cr0; if (!npt_enabled) - cr0 |= X86_CR0_PG | X86_CR0_WP; + hcr0 |= X86_CR0_PG | X86_CR0_WP; /* * re-enable caching here because the QEMU bios @@ -1765,10 +1759,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * reboot */ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); - svm->vmcb->save.cr0 = cr0; + hcr0 &= ~(X86_CR0_CD | X86_CR0_NW); + + svm->vmcb->save.cr0 = hcr0; vmcb_mark_dirty(svm->vmcb, VMCB_CR); - update_cr0_intercept(svm); + + /* + * SEV-ES guests must always keep the CR intercepts cleared. CR + * tracking is done using the CR write traps. + */ + if (sev_es_guest(vcpu->kvm)) + return; + + if (hcr0 == cr0) { + /* Selective CR0 write remains on. */ + svm_clr_intercept(svm, INTERCEPT_CR0_READ); + svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); + } else { + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); + } } static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1847,7 +1857,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } - svm->asid_generation = sd->asid_generation; + svm->current_vmcb->asid_generation = sd->asid_generation; svm->asid = sd->next_asid++; } @@ -1896,39 +1906,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) vmcb_mark_dirty(svm->vmcb, VMCB_DR); } -static int pf_interception(struct vcpu_svm *svm) +static int pf_interception(struct kvm_vcpu *vcpu) { - u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); + struct vcpu_svm *svm = to_svm(vcpu); + + u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; - return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, + return kvm_handle_page_fault(vcpu, error_code, fault_address, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); } -static int npf_interception(struct vcpu_svm *svm) +static int npf_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); u64 error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); - return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + return kvm_mmu_page_fault(vcpu, fault_address, error_code, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); } -static int db_interception(struct vcpu_svm *svm) +static int db_interception(struct kvm_vcpu *vcpu) { - struct kvm_run *kvm_run = svm->vcpu.run; - struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_run *kvm_run = vcpu->run; + struct vcpu_svm *svm = to_svm(vcpu); - if (!(svm->vcpu.guest_debug & + if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; - kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload); + kvm_queue_exception_p(vcpu, DB_VECTOR, payload); return 1; } @@ -1938,7 +1952,7 @@ static int db_interception(struct vcpu_svm *svm) kvm_make_request(KVM_REQ_EVENT, vcpu); } - if (svm->vcpu.guest_debug & + if (vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; @@ -1952,9 +1966,10 @@ static int db_interception(struct vcpu_svm *svm) return 1; } -static int bp_interception(struct vcpu_svm *svm) +static int bp_interception(struct kvm_vcpu *vcpu) { - struct kvm_run *kvm_run = svm->vcpu.run; + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_run *kvm_run = vcpu->run; kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; @@ -1962,14 +1977,14 @@ static int bp_interception(struct vcpu_svm *svm) return 0; } -static int ud_interception(struct vcpu_svm *svm) +static int ud_interception(struct kvm_vcpu *vcpu) { - return handle_ud(&svm->vcpu); + return handle_ud(vcpu); } -static int ac_interception(struct vcpu_svm *svm) +static int ac_interception(struct kvm_vcpu *vcpu) { - kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0); + kvm_queue_exception_e(vcpu, AC_VECTOR, 0); return 1; } @@ -2012,7 +2027,7 @@ static bool is_erratum_383(void) return true; } -static void svm_handle_mce(struct vcpu_svm *svm) +static void svm_handle_mce(struct kvm_vcpu *vcpu) { if (is_erratum_383()) { /* @@ -2021,7 +2036,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) */ pr_err("KVM: Guest triggered AMD Erratum 383\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2033,20 +2048,21 @@ static void svm_handle_mce(struct vcpu_svm *svm) kvm_machine_check(); } -static int mc_interception(struct vcpu_svm *svm) +static int mc_interception(struct kvm_vcpu *vcpu) { return 1; } -static int shutdown_interception(struct vcpu_svm *svm) +static int shutdown_interception(struct kvm_vcpu *vcpu) { - struct kvm_run *kvm_run = svm->vcpu.run; + struct kvm_run *kvm_run = vcpu->run; + struct vcpu_svm *svm = to_svm(vcpu); /* * The VM save area has already been encrypted so it * cannot be reinitialized - just terminate. */ - if (sev_es_guest(svm->vcpu.kvm)) + if (sev_es_guest(vcpu->kvm)) return -EINVAL; /* @@ -2054,20 +2070,20 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm); + init_vmcb(vcpu); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } -static int io_interception(struct vcpu_svm *svm) +static int io_interception(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ int size, in, string; unsigned port; - ++svm->vcpu.stat.io_exits; + ++vcpu->stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; port = io_info >> 16; @@ -2082,93 +2098,67 @@ static int io_interception(struct vcpu_svm *svm) svm->next_rip = svm->vmcb->control.exit_info_2; - return kvm_fast_pio(&svm->vcpu, size, port, in); -} - -static int nmi_interception(struct vcpu_svm *svm) -{ - return 1; + return kvm_fast_pio(vcpu, size, port, in); } -static int intr_interception(struct vcpu_svm *svm) +static int nmi_interception(struct kvm_vcpu *vcpu) { - ++svm->vcpu.stat.irq_exits; return 1; } -static int nop_on_interception(struct vcpu_svm *svm) +static int intr_interception(struct kvm_vcpu *vcpu) { + ++vcpu->stat.irq_exits; return 1; } -static int halt_interception(struct vcpu_svm *svm) -{ - return kvm_emulate_halt(&svm->vcpu); -} - -static int vmmcall_interception(struct vcpu_svm *svm) +static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) { - return kvm_emulate_hypercall(&svm->vcpu); -} - -static int vmload_interception(struct vcpu_svm *svm) -{ - struct vmcb *nested_vmcb; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb12; struct kvm_host_map map; int ret; - if (nested_svm_check_permissions(svm)) + if (nested_svm_check_permissions(vcpu)) return 1; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); + ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); if (ret) { if (ret == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } - nested_vmcb = map.hva; + vmcb12 = map.hva; + + ret = kvm_skip_emulated_instruction(vcpu); - ret = kvm_skip_emulated_instruction(&svm->vcpu); + if (vmload) + nested_svm_vmloadsave(vmcb12, svm->vmcb); + else + nested_svm_vmloadsave(svm->vmcb, vmcb12); - nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map, true); return ret; } -static int vmsave_interception(struct vcpu_svm *svm) +static int vmload_interception(struct kvm_vcpu *vcpu) { - struct vmcb *nested_vmcb; - struct kvm_host_map map; - int ret; - - if (nested_svm_check_permissions(svm)) - return 1; - - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); - if (ret) { - if (ret == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } - - nested_vmcb = map.hva; - - ret = kvm_skip_emulated_instruction(&svm->vcpu); - - nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + return vmload_vmsave_interception(vcpu, true); +} - return ret; +static int vmsave_interception(struct kvm_vcpu *vcpu) +{ + return vmload_vmsave_interception(vcpu, false); } -static int vmrun_interception(struct vcpu_svm *svm) +static int vmrun_interception(struct kvm_vcpu *vcpu) { - if (nested_svm_check_permissions(svm)) + if (nested_svm_check_permissions(vcpu)) return 1; - return nested_svm_vmrun(svm); + return nested_svm_vmrun(vcpu); } enum { @@ -2207,7 +2197,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, }; - int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = { + int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_INSTR_VMRUN] = vmrun_interception, [SVM_INSTR_VMLOAD] = vmload_interception, [SVM_INSTR_VMSAVE] = vmsave_interception, @@ -2216,17 +2206,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) int ret; if (is_guest_mode(vcpu)) { - svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode]; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - /* Returns '1' or -errno on failure, '0' on success. */ - ret = nested_svm_vmexit(svm); + ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); if (ret) return ret; return 1; } - return svm_instr_handlers[opcode](svm); + return svm_instr_handlers[opcode](vcpu); } /* @@ -2237,9 +2223,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) * regions (e.g. SMM memory on host). * 2) VMware backdoor */ -static int gp_interception(struct vcpu_svm *svm) +static int gp_interception(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); u32 error_code = svm->vmcb->control.exit_info_1; int opcode; @@ -2304,73 +2290,52 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) } } -static int stgi_interception(struct vcpu_svm *svm) +static int stgi_interception(struct kvm_vcpu *vcpu) { int ret; - if (nested_svm_check_permissions(svm)) + if (nested_svm_check_permissions(vcpu)) return 1; - ret = kvm_skip_emulated_instruction(&svm->vcpu); - svm_set_gif(svm, true); + ret = kvm_skip_emulated_instruction(vcpu); + svm_set_gif(to_svm(vcpu), true); return ret; } -static int clgi_interception(struct vcpu_svm *svm) +static int clgi_interception(struct kvm_vcpu *vcpu) { int ret; - if (nested_svm_check_permissions(svm)) + if (nested_svm_check_permissions(vcpu)) return 1; - ret = kvm_skip_emulated_instruction(&svm->vcpu); - svm_set_gif(svm, false); + ret = kvm_skip_emulated_instruction(vcpu); + svm_set_gif(to_svm(vcpu), false); return ret; } -static int invlpga_interception(struct vcpu_svm *svm) +static int invlpga_interception(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; - - trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu), - kvm_rax_read(&svm->vcpu)); + trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, kvm_rcx_read(vcpu), + kvm_rax_read(vcpu)); /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ - kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); + kvm_mmu_invlpg(vcpu, kvm_rax_read(vcpu)); - return kvm_skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(vcpu); } -static int skinit_interception(struct vcpu_svm *svm) +static int skinit_interception(struct kvm_vcpu *vcpu) { - trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu)); + trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -static int wbinvd_interception(struct vcpu_svm *svm) -{ - return kvm_emulate_wbinvd(&svm->vcpu); -} - -static int xsetbv_interception(struct vcpu_svm *svm) -{ - u64 new_bv = kvm_read_edx_eax(&svm->vcpu); - u32 index = kvm_rcx_read(&svm->vcpu); - - int err = kvm_set_xcr(&svm->vcpu, index, new_bv); - return kvm_complete_insn_gp(&svm->vcpu, err); -} - -static int rdpru_interception(struct vcpu_svm *svm) -{ - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - -static int task_switch_interception(struct vcpu_svm *svm) +static int task_switch_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); u16 tss_selector; int reason; int int_type = svm->vmcb->control.exit_int_info & @@ -2399,7 +2364,7 @@ static int task_switch_interception(struct vcpu_svm *svm) if (reason == TASK_SWITCH_GATE) { switch (type) { case SVM_EXITINTINFO_TYPE_NMI: - svm->vcpu.arch.nmi_injected = false; + vcpu->arch.nmi_injected = false; break; case SVM_EXITINTINFO_TYPE_EXEPT: if (svm->vmcb->control.exit_info_2 & @@ -2408,10 +2373,10 @@ static int task_switch_interception(struct vcpu_svm *svm) error_code = (u32)svm->vmcb->control.exit_info_2; } - kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_exception_queue(vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: - kvm_clear_interrupt_queue(&svm->vcpu); + kvm_clear_interrupt_queue(vcpu); break; default: break; @@ -2422,77 +2387,58 @@ static int task_switch_interception(struct vcpu_svm *svm) int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (!skip_emulated_instruction(&svm->vcpu)) + if (!skip_emulated_instruction(vcpu)) return 0; } if (int_type != SVM_EXITINTINFO_TYPE_SOFT) int_vec = -1; - return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, + return kvm_task_switch(vcpu, tss_selector, int_vec, reason, has_error_code, error_code); } -static int cpuid_interception(struct vcpu_svm *svm) +static int iret_interception(struct kvm_vcpu *vcpu) { - return kvm_emulate_cpuid(&svm->vcpu); -} + struct vcpu_svm *svm = to_svm(vcpu); -static int iret_interception(struct vcpu_svm *svm) -{ - ++svm->vcpu.stat.nmi_window_exits; - svm->vcpu.arch.hflags |= HF_IRET_MASK; - if (!sev_es_guest(svm->vcpu.kvm)) { + ++vcpu->stat.nmi_window_exits; + vcpu->arch.hflags |= HF_IRET_MASK; + if (!sev_es_guest(vcpu->kvm)) { svm_clr_intercept(svm, INTERCEPT_IRET); - svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + svm->nmi_iret_rip = kvm_rip_read(vcpu); } - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } -static int invd_interception(struct vcpu_svm *svm) -{ - /* Treat an INVD instruction as a NOP and just skip it. */ - return kvm_skip_emulated_instruction(&svm->vcpu); -} - -static int invlpg_interception(struct vcpu_svm *svm) +static int invlpg_interception(struct kvm_vcpu *vcpu) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return kvm_emulate_instruction(&svm->vcpu, 0); - - kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); - return kvm_skip_emulated_instruction(&svm->vcpu); -} + return kvm_emulate_instruction(vcpu, 0); -static int emulate_on_interception(struct vcpu_svm *svm) -{ - return kvm_emulate_instruction(&svm->vcpu, 0); + kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1); + return kvm_skip_emulated_instruction(vcpu); } -static int rsm_interception(struct vcpu_svm *svm) +static int emulate_on_interception(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2); + return kvm_emulate_instruction(vcpu, 0); } -static int rdpmc_interception(struct vcpu_svm *svm) +static int rsm_interception(struct kvm_vcpu *vcpu) { - int err; - - if (!nrips) - return emulate_on_interception(svm); - - err = kvm_rdpmc(&svm->vcpu); - return kvm_complete_insn_gp(&svm->vcpu, err); + return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2); } -static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, +static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, unsigned long val) { - unsigned long cr0 = svm->vcpu.arch.cr0; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long cr0 = vcpu->arch.cr0; bool ret = false; - if (!is_guest_mode(&svm->vcpu) || + if (!is_guest_mode(vcpu) || (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; @@ -2509,17 +2455,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, #define CR_VALID (1ULL << 63) -static int cr_interception(struct vcpu_svm *svm) +static int cr_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int reg, cr; unsigned long val; int err; if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_on_interception(svm); + return emulate_on_interception(vcpu); if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) - return emulate_on_interception(svm); + return emulate_on_interception(vcpu); reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) @@ -2530,61 +2477,61 @@ static int cr_interception(struct vcpu_svm *svm) err = 0; if (cr >= 16) { /* mov to cr */ cr -= 16; - val = kvm_register_read(&svm->vcpu, reg); + val = kvm_register_read(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: - if (!check_selective_cr0_intercepted(svm, val)) - err = kvm_set_cr0(&svm->vcpu, val); + if (!check_selective_cr0_intercepted(vcpu, val)) + err = kvm_set_cr0(vcpu, val); else return 1; break; case 3: - err = kvm_set_cr3(&svm->vcpu, val); + err = kvm_set_cr3(vcpu, val); break; case 4: - err = kvm_set_cr4(&svm->vcpu, val); + err = kvm_set_cr4(vcpu, val); break; case 8: - err = kvm_set_cr8(&svm->vcpu, val); + err = kvm_set_cr8(vcpu, val); break; default: WARN(1, "unhandled write to CR%d", cr); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } } else { /* mov from cr */ switch (cr) { case 0: - val = kvm_read_cr0(&svm->vcpu); + val = kvm_read_cr0(vcpu); break; case 2: - val = svm->vcpu.arch.cr2; + val = vcpu->arch.cr2; break; case 3: - val = kvm_read_cr3(&svm->vcpu); + val = kvm_read_cr3(vcpu); break; case 4: - val = kvm_read_cr4(&svm->vcpu); + val = kvm_read_cr4(vcpu); break; case 8: - val = kvm_get_cr8(&svm->vcpu); + val = kvm_get_cr8(vcpu); break; default: WARN(1, "unhandled read from CR%d", cr); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - kvm_register_write(&svm->vcpu, reg, val); + kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); } - return kvm_complete_insn_gp(&svm->vcpu, err); + return kvm_complete_insn_gp(vcpu, err); } -static int cr_trap(struct vcpu_svm *svm) +static int cr_trap(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); unsigned long old_value, new_value; unsigned int cr; int ret = 0; @@ -2606,7 +2553,7 @@ static int cr_trap(struct vcpu_svm *svm) kvm_post_set_cr4(vcpu, old_value, new_value); break; case 8: - ret = kvm_set_cr8(&svm->vcpu, new_value); + ret = kvm_set_cr8(vcpu, new_value); break; default: WARN(1, "unhandled CR%d write trap", cr); @@ -2617,57 +2564,57 @@ static int cr_trap(struct vcpu_svm *svm) return kvm_complete_insn_gp(vcpu, ret); } -static int dr_interception(struct vcpu_svm *svm) +static int dr_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int reg, dr; unsigned long val; int err = 0; - if (svm->vcpu.guest_debug == 0) { + if (vcpu->guest_debug == 0) { /* * No more DR vmexits; force a reload of the debug registers * and reenter on this instruction. The next vmexit will * retrieve the full state of the debug registers. */ clr_dr_intercepts(svm); - svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; return 1; } if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_on_interception(svm); + return emulate_on_interception(vcpu); reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; if (dr >= 16) { /* mov to DRn */ dr -= 16; - val = kvm_register_read(&svm->vcpu, reg); - err = kvm_set_dr(&svm->vcpu, dr, val); + val = kvm_register_read(vcpu, reg); + err = kvm_set_dr(vcpu, dr, val); } else { - kvm_get_dr(&svm->vcpu, dr, &val); - kvm_register_write(&svm->vcpu, reg, val); + kvm_get_dr(vcpu, dr, &val); + kvm_register_write(vcpu, reg, val); } - return kvm_complete_insn_gp(&svm->vcpu, err); + return kvm_complete_insn_gp(vcpu, err); } -static int cr8_write_interception(struct vcpu_svm *svm) +static int cr8_write_interception(struct kvm_vcpu *vcpu) { - struct kvm_run *kvm_run = svm->vcpu.run; int r; - u8 cr8_prev = kvm_get_cr8(&svm->vcpu); + u8 cr8_prev = kvm_get_cr8(vcpu); /* instruction emulation calls kvm_set_cr8() */ - r = cr_interception(svm); - if (lapic_in_kernel(&svm->vcpu)) + r = cr_interception(vcpu); + if (lapic_in_kernel(vcpu)) return r; - if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) + if (cr8_prev <= kvm_get_cr8(vcpu)) return r; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } -static int efer_trap(struct vcpu_svm *svm) +static int efer_trap(struct kvm_vcpu *vcpu) { struct msr_data msr_info; int ret; @@ -2680,10 +2627,10 @@ static int efer_trap(struct vcpu_svm *svm) */ msr_info.host_initiated = false; msr_info.index = MSR_EFER; - msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME; - ret = kvm_set_msr_common(&svm->vcpu, &msr_info); + msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; + ret = kvm_set_msr_common(vcpu, &msr_info); - return kvm_complete_insn_gp(&svm->vcpu, ret); + return kvm_complete_insn_gp(vcpu, ret); } static int svm_get_msr_feature(struct kvm_msr_entry *msr) @@ -2710,24 +2657,24 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_STAR: - msr_info->data = svm->vmcb->save.star; + msr_info->data = svm->vmcb01.ptr->save.star; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - msr_info->data = svm->vmcb->save.lstar; + msr_info->data = svm->vmcb01.ptr->save.lstar; break; case MSR_CSTAR: - msr_info->data = svm->vmcb->save.cstar; + msr_info->data = svm->vmcb01.ptr->save.cstar; break; case MSR_KERNEL_GS_BASE: - msr_info->data = svm->vmcb->save.kernel_gs_base; + msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; break; case MSR_SYSCALL_MASK: - msr_info->data = svm->vmcb->save.sfmask; + msr_info->data = svm->vmcb01.ptr->save.sfmask; break; #endif case MSR_IA32_SYSENTER_CS: - msr_info->data = svm->vmcb->save.sysenter_cs; + msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: msr_info->data = svm->sysenter_eip; @@ -2771,7 +2718,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_has_spec_ctrl_msr(vcpu)) return 1; - msr_info->data = svm->spec_ctrl; + if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + msr_info->data = svm->vmcb->save.spec_ctrl; + else + msr_info->data = svm->spec_ctrl; break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr_info->host_initiated && @@ -2809,8 +2759,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { struct vcpu_svm *svm = to_svm(vcpu); - if (!sev_es_guest(svm->vcpu.kvm) || !err) - return kvm_complete_insn_gp(&svm->vcpu, err); + if (!sev_es_guest(vcpu->kvm) || !err) + return kvm_complete_insn_gp(vcpu, err); ghcb_set_sw_exit_info_1(svm->ghcb, 1); ghcb_set_sw_exit_info_2(svm->ghcb, @@ -2820,11 +2770,6 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) return 1; } -static int rdmsr_interception(struct vcpu_svm *svm) -{ - return kvm_emulate_rdmsr(&svm->vcpu); -} - static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2861,7 +2806,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) return 1; vcpu->arch.pat = data; - svm->vmcb->save.g_pat = data; + svm->vmcb01.ptr->save.g_pat = data; + if (is_guest_mode(vcpu)) + nested_vmcb02_compute_g_pat(svm); vmcb_mark_dirty(svm->vmcb, VMCB_NPT); break; case MSR_IA32_SPEC_CTRL: @@ -2872,7 +2819,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (kvm_spec_ctrl_test_value(data)) return 1; - svm->spec_ctrl = data; + if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + svm->vmcb->save.spec_ctrl = data; + else + svm->spec_ctrl = data; if (!data) break; @@ -2915,32 +2865,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->virt_spec_ctrl = data; break; case MSR_STAR: - svm->vmcb->save.star = data; + svm->vmcb01.ptr->save.star = data; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - svm->vmcb->save.lstar = data; + svm->vmcb01.ptr->save.lstar = data; break; case MSR_CSTAR: - svm->vmcb->save.cstar = data; + svm->vmcb01.ptr->save.cstar = data; break; case MSR_KERNEL_GS_BASE: - svm->vmcb->save.kernel_gs_base = data; + svm->vmcb01.ptr->save.kernel_gs_base = data; break; case MSR_SYSCALL_MASK: - svm->vmcb->save.sfmask = data; + svm->vmcb01.ptr->save.sfmask = data; break; #endif case MSR_IA32_SYSENTER_CS: - svm->vmcb->save.sysenter_cs = data; + svm->vmcb01.ptr->save.sysenter_cs = data; break; case MSR_IA32_SYSENTER_EIP: svm->sysenter_eip = data; - svm->vmcb->save.sysenter_eip = data; + svm->vmcb01.ptr->save.sysenter_eip = data; break; case MSR_IA32_SYSENTER_ESP: svm->sysenter_esp = data; - svm->vmcb->save.sysenter_esp = data; + svm->vmcb01.ptr->save.sysenter_esp = data; break; case MSR_TSC_AUX: if (!boot_cpu_has(X86_FEATURE_RDTSCP)) @@ -3006,38 +2956,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 0; } -static int wrmsr_interception(struct vcpu_svm *svm) +static int msr_interception(struct kvm_vcpu *vcpu) { - return kvm_emulate_wrmsr(&svm->vcpu); -} - -static int msr_interception(struct vcpu_svm *svm) -{ - if (svm->vmcb->control.exit_info_1) - return wrmsr_interception(svm); + if (to_svm(vcpu)->vmcb->control.exit_info_1) + return kvm_emulate_wrmsr(vcpu); else - return rdmsr_interception(svm); + return kvm_emulate_rdmsr(vcpu); } -static int interrupt_window_interception(struct vcpu_svm *svm) +static int interrupt_window_interception(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - svm_clear_vintr(svm); + kvm_make_request(KVM_REQ_EVENT, vcpu); + svm_clear_vintr(to_svm(vcpu)); /* * For AVIC, the only reason to end up here is ExtINTs. * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - svm_toggle_avic_for_irq_window(&svm->vcpu, true); + svm_toggle_avic_for_irq_window(vcpu, true); - ++svm->vcpu.stat.irq_window_exits; + ++vcpu->stat.irq_window_exits; return 1; } -static int pause_interception(struct vcpu_svm *svm) +static int pause_interception(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; bool in_kernel; /* @@ -3045,35 +2989,18 @@ static int pause_interception(struct vcpu_svm *svm) * vcpu->arch.preempted_in_kernel can never be true. Just * set in_kernel to false as well. */ - in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0; + in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); kvm_vcpu_on_spin(vcpu, in_kernel); - return 1; -} - -static int nop_interception(struct vcpu_svm *svm) -{ - return kvm_skip_emulated_instruction(&(svm->vcpu)); + return kvm_skip_emulated_instruction(vcpu); } -static int monitor_interception(struct vcpu_svm *svm) +static int invpcid_interception(struct kvm_vcpu *vcpu) { - printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); - return nop_interception(svm); -} - -static int mwait_interception(struct vcpu_svm *svm) -{ - printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); - return nop_interception(svm); -} - -static int invpcid_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); unsigned long type; gva_t gva; @@ -3098,7 +3025,7 @@ static int invpcid_interception(struct vcpu_svm *svm) return kvm_handle_invpcid(vcpu, type, gva); } -static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { +static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, [SVM_EXIT_READ_CR4] = cr_interception, @@ -3133,15 +3060,15 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, - [SVM_EXIT_SMI] = nop_on_interception, - [SVM_EXIT_INIT] = nop_on_interception, + [SVM_EXIT_SMI] = kvm_emulate_as_nop, + [SVM_EXIT_INIT] = kvm_emulate_as_nop, [SVM_EXIT_VINTR] = interrupt_window_interception, - [SVM_EXIT_RDPMC] = rdpmc_interception, - [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, + [SVM_EXIT_CPUID] = kvm_emulate_cpuid, [SVM_EXIT_IRET] = iret_interception, - [SVM_EXIT_INVD] = invd_interception, + [SVM_EXIT_INVD] = kvm_emulate_invd, [SVM_EXIT_PAUSE] = pause_interception, - [SVM_EXIT_HLT] = halt_interception, + [SVM_EXIT_HLT] = kvm_emulate_halt, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, [SVM_EXIT_IOIO] = io_interception, @@ -3149,17 +3076,17 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, [SVM_EXIT_VMRUN] = vmrun_interception, - [SVM_EXIT_VMMCALL] = vmmcall_interception, + [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, [SVM_EXIT_VMLOAD] = vmload_interception, [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, - [SVM_EXIT_WBINVD] = wbinvd_interception, - [SVM_EXIT_MONITOR] = monitor_interception, - [SVM_EXIT_MWAIT] = mwait_interception, - [SVM_EXIT_XSETBV] = xsetbv_interception, - [SVM_EXIT_RDPRU] = rdpru_interception, + [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, + [SVM_EXIT_MONITOR] = kvm_emulate_monitor, + [SVM_EXIT_MWAIT] = kvm_emulate_mwait, + [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, + [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, @@ -3177,6 +3104,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); @@ -3239,28 +3167,28 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) save->ds.limit, save->ds.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "fs:", - save->fs.selector, save->fs.attrib, - save->fs.limit, save->fs.base); + save01->fs.selector, save01->fs.attrib, + save01->fs.limit, save01->fs.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "gs:", - save->gs.selector, save->gs.attrib, - save->gs.limit, save->gs.base); + save01->gs.selector, save01->gs.attrib, + save01->gs.limit, save01->gs.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "gdtr:", save->gdtr.selector, save->gdtr.attrib, save->gdtr.limit, save->gdtr.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "ldtr:", - save->ldtr.selector, save->ldtr.attrib, - save->ldtr.limit, save->ldtr.base); + save01->ldtr.selector, save01->ldtr.attrib, + save01->ldtr.limit, save01->ldtr.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "idtr:", save->idtr.selector, save->idtr.attrib, save->idtr.limit, save->idtr.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "tr:", - save->tr.selector, save->tr.attrib, - save->tr.limit, save->tr.base); + save01->tr.selector, save01->tr.attrib, + save01->tr.limit, save01->tr.base); pr_err("cpl: %d efer: %016llx\n", save->cpl, save->efer); pr_err("%-15s %016llx %-13s %016llx\n", @@ -3274,15 +3202,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "rsp:", save->rsp, "rax:", save->rax); pr_err("%-15s %016llx %-13s %016llx\n", - "star:", save->star, "lstar:", save->lstar); + "star:", save01->star, "lstar:", save01->lstar); pr_err("%-15s %016llx %-13s %016llx\n", - "cstar:", save->cstar, "sfmask:", save->sfmask); + "cstar:", save01->cstar, "sfmask:", save01->sfmask); pr_err("%-15s %016llx %-13s %016llx\n", - "kernel_gs_base:", save->kernel_gs_base, - "sysenter_cs:", save->sysenter_cs); + "kernel_gs_base:", save01->kernel_gs_base, + "sysenter_cs:", save01->sysenter_cs); pr_err("%-15s %016llx %-13s %016llx\n", - "sysenter_esp:", save->sysenter_esp, - "sysenter_eip:", save->sysenter_eip); + "sysenter_esp:", save01->sysenter_esp, + "sysenter_eip:", save01->sysenter_eip); pr_err("%-15s %016llx %-13s %016llx\n", "gpat:", save->g_pat, "dbgctl:", save->dbgctl); pr_err("%-15s %016llx %-13s %016llx\n", @@ -3309,24 +3237,24 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) return -EINVAL; } -int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code) +int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (svm_handle_invalid_exit(&svm->vcpu, exit_code)) + if (svm_handle_invalid_exit(vcpu, exit_code)) return 0; #ifdef CONFIG_RETPOLINE if (exit_code == SVM_EXIT_MSR) - return msr_interception(svm); + return msr_interception(vcpu); else if (exit_code == SVM_EXIT_VINTR) - return interrupt_window_interception(svm); + return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) - return intr_interception(svm); + return intr_interception(vcpu); else if (exit_code == SVM_EXIT_HLT) - return halt_interception(svm); + return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) - return npf_interception(svm); + return npf_interception(vcpu); #endif - return svm_exit_handlers[exit_code](svm); + return svm_exit_handlers[exit_code](vcpu); } static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, @@ -3395,7 +3323,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - return svm_invoke_exit_handler(svm, exit_code); + return svm_invoke_exit_handler(vcpu, exit_code); } static void reload_tss(struct kvm_vcpu *vcpu) @@ -3406,15 +3334,28 @@ static void reload_tss(struct kvm_vcpu *vcpu) load_TR_desc(); } -static void pre_svm_run(struct vcpu_svm *svm) +static void pre_svm_run(struct kvm_vcpu *vcpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu); + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If the previous vmrun of the vmcb occurred on + * a different physical cpu then we must mark the vmcb dirty. + * and assign a new asid. + */ + + if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { + svm->current_vmcb->asid_generation = 0; + vmcb_mark_all_dirty(svm->vmcb); + svm->current_vmcb->cpu = vcpu->cpu; + } - if (sev_guest(svm->vcpu.kvm)) - return pre_sev_run(svm, svm->vcpu.cpu); + if (sev_guest(vcpu->kvm)) + return pre_sev_run(svm, vcpu->cpu); /* FIXME: handle wraparound of asid_generation */ - if (svm->asid_generation != sd->asid_generation) + if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); } @@ -3424,7 +3365,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - if (!sev_es_guest(svm->vcpu.kvm)) + if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -3478,7 +3419,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) return false; ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (svm->vcpu.arch.hflags & HF_NMI_MASK); + (vcpu->arch.hflags & HF_NMI_MASK); return ret; } @@ -3498,9 +3439,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); + return !!(vcpu->arch.hflags & HF_NMI_MASK); } static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -3508,12 +3447,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) struct vcpu_svm *svm = to_svm(vcpu); if (masked) { - svm->vcpu.arch.hflags |= HF_NMI_MASK; - if (!sev_es_guest(svm->vcpu.kvm)) + vcpu->arch.hflags |= HF_NMI_MASK; + if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); } else { - svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - if (!sev_es_guest(svm->vcpu.kvm)) + vcpu->arch.hflags &= ~HF_NMI_MASK; + if (!sev_es_guest(vcpu->kvm)) svm_clr_intercept(svm, INTERCEPT_IRET); } } @@ -3526,7 +3465,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { /* * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask * bit to determine the state of the IF flag. @@ -3536,7 +3475,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) } else if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) - ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF) + ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) return true; @@ -3595,8 +3534,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) - == HF_NMI_MASK) + if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { @@ -3638,7 +3576,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu) if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; else - svm->asid_generation--; + svm->current_vmcb->asid_generation--; } static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) @@ -3675,8 +3613,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } -static void svm_complete_interrupts(struct vcpu_svm *svm) +static void svm_complete_interrupts(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; @@ -3688,28 +3627,28 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) * If we've made progress since setting HF_IRET_MASK, we've * executed an IRET and can allow NMI injection. */ - if ((svm->vcpu.arch.hflags & HF_IRET_MASK) && - (sev_es_guest(svm->vcpu.kvm) || - kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) { - svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + if ((vcpu->arch.hflags & HF_IRET_MASK) && + (sev_es_guest(vcpu->kvm) || + kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { + vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + kvm_make_request(KVM_REQ_EVENT, vcpu); } - svm->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; switch (type) { case SVM_EXITINTINFO_TYPE_NMI: - svm->vcpu.arch.nmi_injected = true; + vcpu->arch.nmi_injected = true; break; case SVM_EXITINTINFO_TYPE_EXEPT: /* @@ -3725,21 +3664,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) */ if (kvm_exception_is_soft(vector)) { if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) - kvm_rip_write(&svm->vcpu, - kvm_rip_read(&svm->vcpu) - - int3_injected); + kvm_is_linear_rip(vcpu, svm->int3_rip)) + kvm_rip_write(vcpu, + kvm_rip_read(vcpu) - int3_injected); break; } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(&svm->vcpu, vector, err); + kvm_requeue_exception_e(vcpu, vector, err); } else - kvm_requeue_exception(&svm->vcpu, vector); + kvm_requeue_exception(vcpu, vector); break; case SVM_EXITINTINFO_TYPE_INTR: - kvm_queue_interrupt(&svm->vcpu, vector, false); + kvm_queue_interrupt(vcpu, vector, false); break; default: break; @@ -3754,7 +3692,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) control->exit_int_info = control->event_inj; control->exit_int_info_err = control->event_inj_err; control->event_inj = 0; - svm_complete_interrupts(svm); + svm_complete_interrupts(vcpu); } static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) @@ -3766,9 +3704,10 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, - struct vcpu_svm *svm) +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + /* * VMENTER enables interrupts (host state), but the kernel state is * interrupts disabled when this is invoked. Also tell RCU about @@ -3789,12 +3728,14 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { __svm_sev_es_vcpu_run(svm->vmcb_pa); } else { struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); + vmload(svm->vmcb01.pa); + __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&vcpu->arch.regs); + vmsave(svm->vmcb01.pa); vmload(__sme_page_pa(sd->save_area)); } @@ -3845,7 +3786,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) smp_send_reschedule(vcpu->cpu); } - pre_svm_run(svm); + pre_svm_run(vcpu); sync_lapic_to_cr8(vcpu); @@ -3859,7 +3800,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) svm_set_dr6(svm, vcpu->arch.dr6); else svm_set_dr6(svm, DR6_ACTIVE_LOW); @@ -3875,9 +3816,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - svm_vcpu_enter_exit(vcpu, svm); + svm_vcpu_enter_exit(vcpu); /* * We do not use IBRS in the kernel. If this vCPU has used the @@ -3894,15 +3836,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) && + unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (!sev_es_guest(svm->vcpu.kvm)) + if (!sev_es_guest(vcpu->kvm)) reload_tss(vcpu); - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); - if (!sev_es_guest(svm->vcpu.kvm)) { + if (!sev_es_guest(vcpu->kvm)) { vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; @@ -3910,7 +3854,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_interrupt(&svm->vcpu); + kvm_before_interrupt(vcpu); kvm_load_host_xsave_state(vcpu); stgi(); @@ -3918,13 +3862,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) /* Any pending NMI will happen here */ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_after_interrupt(&svm->vcpu); + kvm_after_interrupt(vcpu); sync_cr8_to_lapic(vcpu); svm->next_rip = 0; - if (is_guest_mode(&svm->vcpu)) { - sync_nested_vmcb_control(svm); + if (is_guest_mode(vcpu)) { + nested_sync_control_from_vmcb02(svm); svm->nested.nested_run_pending = 0; } @@ -3933,7 +3877,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->vcpu.arch.apf.host_apf_flags = + vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); if (npt_enabled) { @@ -3947,9 +3891,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + MC_VECTOR)) - svm_handle_mce(svm); + svm_handle_mce(vcpu); - svm_complete_interrupts(svm); + svm_complete_interrupts(vcpu); if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; @@ -3957,21 +3901,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) return svm_exit_handlers_fastpath(vcpu); } -static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct vcpu_svm *svm = to_svm(vcpu); unsigned long cr3; - cr3 = __sme_set(root); if (npt_enabled) { - svm->vmcb->control.nested_cr3 = cr3; + svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); vmcb_mark_dirty(svm->vmcb, VMCB_NPT); /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) return; cr3 = vcpu->arch.cr3; + } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { + cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); + } else { + /* PCID in the guest should be impossible with a 32-bit MMU. */ + WARN_ON_ONCE(kvm_get_active_pcid(vcpu)); + cr3 = root_hpa; } svm->vmcb->save.cr3 = cr3; @@ -4048,7 +3997,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* Update nrips enabled cache */ svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && - guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); + guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); /* Check again if INVPCID interception if required */ svm_check_invpcid(svm); @@ -4349,15 +4298,15 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (!(saved_efer & EFER_SVME)) return 1; - if (kvm_vcpu_map(&svm->vcpu, + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) return 1; if (svm_allocate_nested(svm)) return 1; - ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva); + kvm_vcpu_unmap(vcpu, &map, true); } } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 39e071fdab0c..8e276c4fb33d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -81,11 +81,19 @@ struct kvm_svm { struct kvm_vcpu; +struct kvm_vmcb_info { + struct vmcb *ptr; + unsigned long pa; + int cpu; + uint64_t asid_generation; +}; + struct svm_nested_state { - struct vmcb *hsave; + struct kvm_vmcb_info vmcb02; u64 hsave_msr; u64 vm_cr_msr; u64 vmcb12_gpa; + u64 last_vmcb12_gpa; /* These are the merged vectors */ u32 *msrpm; @@ -104,9 +112,10 @@ struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; unsigned long vmcb_pa; + struct kvm_vmcb_info vmcb01; + struct kvm_vmcb_info *current_vmcb; struct svm_cpu_data *svm_data; u32 asid; - uint64_t asid_generation; uint64_t sysenter_esp; uint64_t sysenter_eip; uint64_t tsc_aux; @@ -239,17 +248,14 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) vmcb->control.clean &= ~(1 << bit); } -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) { - return container_of(vcpu, struct vcpu_svm, vcpu); + return !test_bit(bit, (unsigned long *)&vmcb->control.clean); } -static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) +static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { - if (is_guest_mode(&svm->vcpu)) - return svm->nested.hsave; - else - return svm->vmcb; + return container_of(vcpu, struct vcpu_svm, vcpu); } static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) @@ -272,7 +278,7 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) static inline void set_dr_intercepts(struct vcpu_svm *svm) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; if (!sev_es_guest(svm->vcpu.kvm)) { vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); @@ -299,7 +305,7 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm) static inline void clr_dr_intercepts(struct vcpu_svm *svm) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; vmcb->control.intercepts[INTERCEPT_DR] = 0; @@ -314,7 +320,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; WARN_ON_ONCE(bit >= 32); vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); @@ -324,7 +330,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; WARN_ON_ONCE(bit >= 32); vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); @@ -334,7 +340,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; vmcb_set_intercept(&vmcb->control, bit); @@ -343,7 +349,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; vmcb_clr_intercept(&vmcb->control, bit); @@ -405,7 +411,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu); bool svm_nmi_blocked(struct kvm_vcpu *vcpu); bool svm_interrupt_blocked(struct kvm_vcpu *vcpu); void svm_set_gif(struct vcpu_svm *svm, bool value); -int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code); +int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); @@ -437,20 +443,30 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb); +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12); void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); -int nested_svm_vmrun(struct vcpu_svm *svm); +int nested_svm_vmrun(struct kvm_vcpu *vcpu); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); + +static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) +{ + svm->vmcb->control.exit_code = exit_code; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + return nested_svm_vmexit(svm); +} + int nested_svm_exit_handled(struct vcpu_svm *svm); -int nested_svm_check_permissions(struct vcpu_svm *svm); +int nested_svm_check_permissions(struct kvm_vcpu *vcpu); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); -void sync_nested_vmcb_control(struct vcpu_svm *svm); +void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); +void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); +void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); extern struct kvm_x86_nested_ops svm_nested_ops; @@ -491,8 +507,8 @@ void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm); void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); -int avic_incomplete_ipi_interception(struct vcpu_svm *svm); -int avic_unaccelerated_access_interception(struct vcpu_svm *svm); +int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); +int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); @@ -565,7 +581,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_hardware_setup(void); void sev_hardware_teardown(void); void sev_free_vcpu(struct kvm_vcpu *vcpu); -int sev_handle_vmgexit(struct vcpu_svm *svm); +int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_create_vcpu(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 6feb8c08f45a..4fa17df123cd 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -79,28 +79,10 @@ SYM_FUNC_START(__svm_vcpu_run) /* Enter guest mode */ sti -1: vmload %_ASM_AX - jmp 3f -2: cmpb $0, kvm_rebooting - jne 3f - ud2 - _ASM_EXTABLE(1b, 2b) -3: vmrun %_ASM_AX - jmp 5f -4: cmpb $0, kvm_rebooting - jne 5f - ud2 - _ASM_EXTABLE(3b, 4b) +1: vmrun %_ASM_AX -5: vmsave %_ASM_AX - jmp 7f -6: cmpb $0, kvm_rebooting - jne 7f - ud2 - _ASM_EXTABLE(5b, 6b) -7: - cli +2: cli #ifdef CONFIG_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ @@ -167,6 +149,13 @@ SYM_FUNC_START(__svm_vcpu_run) #endif pop %_ASM_BP ret + +3: cmpb $0, kvm_rebooting + jne 2b + ud2 + + _ASM_EXTABLE(1b, 3b) + SYM_FUNC_END(__svm_vcpu_run) /** @@ -186,18 +175,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) #endif push %_ASM_BX - /* Enter guest mode */ + /* Move @vmcb to RAX. */ mov %_ASM_ARG1, %_ASM_AX + + /* Enter guest mode */ sti 1: vmrun %_ASM_AX - jmp 3f -2: cmpb $0, kvm_rebooting - jne 3f - ud2 - _ASM_EXTABLE(1b, 2b) -3: cli +2: cli #ifdef CONFIG_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ @@ -217,4 +203,11 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) #endif pop %_ASM_BP ret + +3: cmpb $0, kvm_rebooting + jne 2b + ud2 + + _ASM_EXTABLE(1b, 3b) + SYM_FUNC_END(__svm_sev_es_vcpu_run) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bcca0b80e0d0..fd334e4aa6db 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -21,13 +21,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested_early_check = 0; module_param(nested_early_check, bool, S_IRUGO); -#define CC(consistency_check) \ -({ \ - bool failed = (consistency_check); \ - if (failed) \ - trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ - failed; \ -}) +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK /* * Hyper-V requires all of these, so mark them as supported even though @@ -3453,6 +3447,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); enum nested_evmptrld_status evmptrld_status; + ++vcpu->stat.nested_run; + if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4422,6 +4418,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* Similarly, triple faults in L2 should never escape. */ + WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* Service the TLB flush request for L2 before switching to L1. */ @@ -4558,6 +4557,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx->fail = 0; } +static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) +{ + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); +} + /* * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). @@ -5479,16 +5483,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; - kvm_mmu_unload(vcpu); mmu->ept_ad = accessed_dirty; mmu->mmu_role.base.ad_disabled = !accessed_dirty; vmcs12->ept_pointer = new_eptp; - /* - * TODO: Check what's the correct approach in case - * mmu reload fails. Currently, we just let the next - * reload potentially fail - */ - kvm_mmu_reload(vcpu); + + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } return 0; @@ -6599,6 +6598,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) struct kvm_x86_nested_ops vmx_nested_ops = { .check_events = vmx_check_nested_events, .hv_timer_pending = nested_vmx_preemption_timer_pending, + .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_nested_state_pages = vmx_get_nested_state_pages, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 32cf8287d4a7..c8a4a548e96b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -472,26 +472,6 @@ static const u32 vmx_uret_msrs_list[] = { static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); -/* check_ept_pointer() should be under protection of ept_pointer_lock. */ -static void check_ept_pointer_match(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - u64 tmp_eptp = INVALID_PAGE; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!VALID_PAGE(tmp_eptp)) { - tmp_eptp = to_vmx(vcpu)->ept_pointer; - } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_MISMATCH; - return; - } - } - - to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; -} - static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, void *data) { @@ -501,47 +481,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush range->pages); } -static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_vcpu *vcpu, struct kvm_tlb_range *range) +static inline int hv_remote_flush_root_ept(hpa_t root_ept, + struct kvm_tlb_range *range) { - u64 ept_pointer = to_vmx(vcpu)->ept_pointer; - - /* - * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address - * of the base of EPT PML4 table, strip off EPT configuration - * information. - */ if (range) - return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK, + return hyperv_flush_guest_mapping_range(root_ept, kvm_fill_hv_flush_list_func, (void *)range); else - return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK); + return hyperv_flush_guest_mapping(root_ept); } static int hv_remote_flush_tlb_with_range(struct kvm *kvm, struct kvm_tlb_range *range) { + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0, i, nr_unique_valid_roots; + hpa_t root; - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + spin_lock(&kvm_vmx->hv_root_ept_lock); - if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) - check_ept_pointer_match(kvm); + if (!VALID_PAGE(kvm_vmx->hv_root_ept)) { + nr_unique_valid_roots = 0; - if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + /* + * Flush all valid roots, and see if all vCPUs have converged + * on a common root, in which case future flushes can skip the + * loop and flush the common root. + */ kvm_for_each_vcpu(i, vcpu, kvm) { - /* If ept_pointer is invalid pointer, bypass flush request. */ - if (VALID_PAGE(to_vmx(vcpu)->ept_pointer)) - ret |= __hv_remote_flush_tlb_with_range( - kvm, vcpu, range); + root = to_vmx(vcpu)->hv_root_ept; + if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept) + continue; + + /* + * Set the tracked root to the first valid root. Keep + * this root for the entirety of the loop even if more + * roots are encountered as a low effort optimization + * to avoid flushing the same (first) root again. + */ + if (++nr_unique_valid_roots == 1) + kvm_vmx->hv_root_ept = root; + + if (!ret) + ret = hv_remote_flush_root_ept(root, range); + + /* + * Stop processing roots if a failure occurred and + * multiple valid roots have already been detected. + */ + if (ret && nr_unique_valid_roots > 1) + break; } + + /* + * The optimized flush of a single root can't be used if there + * are multiple valid roots (obviously). + */ + if (nr_unique_valid_roots > 1) + kvm_vmx->hv_root_ept = INVALID_PAGE; } else { - ret = __hv_remote_flush_tlb_with_range(kvm, - kvm_get_vcpu(kvm, 0), range); + ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range); } - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + spin_unlock(&kvm_vmx->hv_root_ept_lock); return ret; } static int hv_remote_flush_tlb(struct kvm *kvm) @@ -576,6 +579,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) #endif /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept) +{ +#if IS_ENABLED(CONFIG_HYPERV) + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + + if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + spin_lock(&kvm_vmx->hv_root_ept_lock); + to_vmx(vcpu)->hv_root_ept = root_ept; + if (root_ept != kvm_vmx->hv_root_ept) + kvm_vmx->hv_root_ept = INVALID_PAGE; + spin_unlock(&kvm_vmx->hv_root_ept_lock); + } +#endif +} + /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -3088,8 +3106,7 @@ static int vmx_get_max_tdp_level(void) return 4; } -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, - int root_level) +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { u64 eptp = VMX_EPTP_MT_WB; @@ -3098,13 +3115,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPTP_AD_ENABLE_BIT; - eptp |= (root_hpa & PAGE_MASK); + eptp |= root_hpa; return eptp; } -static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, - int pgd_level) +static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3112,16 +3129,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, pgd, pgd_level); + eptp = construct_eptp(vcpu, root_hpa, root_level); vmcs_write64(EPT_POINTER, eptp); - if (kvm_x86_ops.tlb_remote_flush) { - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); - to_vmx(vcpu)->ept_pointer = eptp; - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_CHECK; - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); - } + hv_track_root_ept(vcpu, root_hpa); if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; @@ -3131,7 +3142,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { - guest_cr3 = pgd; + guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); } if (update_guest_cr3) @@ -4314,15 +4325,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vmx->secondary_exec_control = exec_control; } -static void ept_set_mmio_spte_mask(void) -{ - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). - */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0); -} - #define VMX_XSS_EXIT_BITMAP 0 /* @@ -5184,17 +5186,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) return 1; } -static int handle_vmcall(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_hypercall(vcpu); -} - -static int handle_invd(struct kvm_vcpu *vcpu) -{ - /* Treat an INVD instruction as a NOP and just skip it. */ - return kvm_skip_emulated_instruction(vcpu); -} - static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); @@ -5203,28 +5194,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static int handle_rdpmc(struct kvm_vcpu *vcpu) -{ - int err; - - err = kvm_rdpmc(vcpu); - return kvm_complete_insn_gp(vcpu, err); -} - -static int handle_wbinvd(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wbinvd(vcpu); -} - -static int handle_xsetbv(struct kvm_vcpu *vcpu) -{ - u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_rcx_read(vcpu); - - int err = kvm_set_xcr(vcpu, index, new_bv); - return kvm_complete_insn_gp(vcpu, err); -} - static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -5485,18 +5454,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -static void vmx_enable_tdp(void) -{ - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, - enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK, 0ull); - - ept_set_mmio_spte_mask(); -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -5516,34 +5473,11 @@ static int handle_pause(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static int handle_nop(struct kvm_vcpu *vcpu) -{ - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_mwait(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - -static int handle_invalid_op(struct kvm_vcpu *vcpu) -{ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} - static int handle_monitor_trap(struct kvm_vcpu *vcpu) { return 1; } -static int handle_monitor(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; @@ -5668,10 +5602,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, [EXIT_REASON_HLT] = kvm_emulate_halt, - [EXIT_REASON_INVD] = handle_invd, + [EXIT_REASON_INVD] = kvm_emulate_invd, [EXIT_REASON_INVLPG] = handle_invlpg, - [EXIT_REASON_RDPMC] = handle_rdpmc, - [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, + [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, @@ -5685,8 +5619,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_APIC_WRITE] = handle_apic_write, [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, - [EXIT_REASON_WBINVD] = handle_wbinvd, - [EXIT_REASON_XSETBV] = handle_xsetbv, + [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, + [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_GDTR_IDTR] = handle_desc, @@ -5694,13 +5628,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, + [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, [EXIT_REASON_INVEPT] = handle_vmx_instruction, [EXIT_REASON_INVVPID] = handle_vmx_instruction, - [EXIT_REASON_RDRAND] = handle_invalid_op, - [EXIT_REASON_RDSEED] = handle_invalid_op, + [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, + [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, @@ -6989,8 +6923,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->pi_desc.nv = POSTED_INTR_VECTOR; vmx->pi_desc.sn = 1; - vmx->ept_pointer = INVALID_PAGE; - +#if IS_ENABLED(CONFIG_HYPERV) + vmx->hv_root_ept = INVALID_PAGE; +#endif return 0; free_vmcs: @@ -7007,7 +6942,9 @@ free_vpid: static int vmx_vm_init(struct kvm *kvm) { - spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); +#if IS_ENABLED(CONFIG_HYPERV) + spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock); +#endif if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -7848,7 +7785,8 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) - vmx_enable_tdp(); + kvm_mmu_set_ept_masks(enable_ept_ad_bits, + cpu_has_vmx_ept_execute_only()); if (!enable_ept) ept_lpage_level = 0; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 89da5e1251f1..0fb3236b0283 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -325,7 +325,9 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; - u64 ept_pointer; +#if IS_ENABLED(CONFIG_HYPERV) + u64 hv_root_ept; +#endif struct pt_desc pt_desc; struct lbr_desc lbr_desc; @@ -338,12 +340,6 @@ struct vcpu_vmx { } shadow_msr_intercept; }; -enum ept_pointers_status { - EPT_POINTERS_CHECK = 0, - EPT_POINTERS_MATCH = 1, - EPT_POINTERS_MISMATCH = 2 -}; - struct kvm_vmx { struct kvm kvm; @@ -351,8 +347,10 @@ struct kvm_vmx { bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; - enum ept_pointers_status ept_pointers_match; - spinlock_t ept_pointer_lock; +#if IS_ENABLED(CONFIG_HYPERV) + hpa_t hv_root_ept; + spinlock_t hv_root_ept_lock; +#endif }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -376,8 +374,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, - int root_level); +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47e021bdcc94..a9d95f90a048 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -245,6 +245,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("l1d_flush", l1d_flush), VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("nested_run", nested_run), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), @@ -544,8 +545,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (has_error && !is_protmode(vcpu)) - has_error = false; if (reinject) { /* * On vmentry, vcpu->arch.exception.pending is only @@ -984,14 +983,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) return 0; } -int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { - if (static_call(kvm_x86_get_cpl)(vcpu) == 0) - return __kvm_set_xcr(vcpu, index, xcr); + if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || + __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { + kvm_inject_gp(vcpu, 0); + return 1; + } - return 1; + return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_set_xcr); +EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1192,20 +1194,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } EXPORT_SYMBOL_GPL(kvm_get_dr); -bool kvm_rdpmc(struct kvm_vcpu *vcpu) +int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; - int err; - err = kvm_pmu_rdpmc(vcpu, ecx, &data); - if (err) - return err; + if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_rax_write(vcpu, (u32)data); kvm_rdx_write(vcpu, data >> 32); - return err; + return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_rdpmc); +EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS @@ -1783,6 +1786,40 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) +{ + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); + +int kvm_emulate_invd(struct kvm_vcpu *vcpu) +{ + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_emulate_as_nop(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_invd); + +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); + return kvm_emulate_as_nop(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} +EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + return kvm_emulate_as_nop(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_monitor); + static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { xfer_to_guest_mode_prepare(); @@ -8004,9 +8041,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -8328,6 +8362,27 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); } + +int kvm_check_nested_events(struct kvm_vcpu *vcpu) +{ + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) + return -EIO; + + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + kvm_x86_ops.nested_ops->triple_fault(vcpu); + return 1; + } + + return kvm_x86_ops.nested_ops->check_events(vcpu); +} + +static void kvm_inject_exception(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) + vcpu->arch.exception.error_code = false; + static_call(kvm_x86_queue_exception)(vcpu); +} + static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) { int r; @@ -8336,7 +8391,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit /* try to reinject previous events if any */ if (vcpu->arch.exception.injected) { - static_call(kvm_x86_queue_exception)(vcpu); + kvm_inject_exception(vcpu); can_inject = false; } /* @@ -8373,7 +8428,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit * from L2 to L1. */ if (is_guest_mode(vcpu)) { - r = kvm_x86_ops.nested_ops->check_events(vcpu); + r = kvm_check_nested_events(vcpu); if (r < 0) goto busy; } @@ -8399,7 +8454,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit } } - static_call(kvm_x86_queue_exception)(vcpu); + kvm_inject_exception(vcpu); can_inject = false; } @@ -8936,10 +8991,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - vcpu->mmio_needed = 0; - r = 0; - goto out; + if (is_guest_mode(vcpu)) { + kvm_x86_ops.nested_ops->triple_fault(vcpu); + } else { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. Do synthetic halt */ @@ -9237,7 +9296,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) - kvm_x86_ops.nested_ops->check_events(vcpu); + kvm_check_nested_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@ -11503,7 +11562,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_mmu_unload(vcpu); + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); return kvm_skip_emulated_instruction(vcpu); default: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 39eb04887141..daccf20fbcd5 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,14 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ +({ \ + bool failed = (consistency_check); \ + if (failed) \ + trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ + failed; \ +}) + #define KVM_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 #define KVM_DEFAULT_PLE_WINDOW_GROW 2 @@ -48,6 +56,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val, #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +int kvm_check_nested_events(struct kvm_vcpu *vcpu); + static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { vcpu->arch.exception.pending = false; |