30 files changed, 263 insertions, 226 deletions
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f38ef299f13b..e9c9388ccc02 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -929,6 +929,10 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); (system_supports_mte() && \ test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags)) +#define kvm_supports_32bit_el0() \ + (system_supports_32bit_el0() && \ + !static_branch_unlikely(&arm64_mismatched_32bit_el0)) + int kvm_trng_call(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM extern phys_addr_t hyp_mem_base; diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 3bb134355874..316917b98707 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -75,9 +75,11 @@ struct kvm_regs { /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define KVM_ARM_DEVICE_TYPE_MASK (0xffff << KVM_ARM_DEVICE_TYPE_SHIFT) +#define KVM_ARM_DEVICE_TYPE_MASK GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ + KVM_ARM_DEVICE_TYPE_SHIFT) #define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK (0xffff << KVM_ARM_DEVICE_ID_SHIFT) +#define KVM_ARM_DEVICE_ID_MASK GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ + KVM_ARM_DEVICE_ID_SHIFT) /* Supported device IDs */ #define KVM_ARM_DEVICE_VGIC_V2 0 diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 986cee6fbc7f..2ff0ef62abad 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -757,8 +757,7 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) if (likely(!vcpu_mode_is_32bit(vcpu))) return false; - return !system_supports_32bit_el0() || - static_branch_unlikely(&arm64_mismatched_32bit_el0); + return !kvm_supports_32bit_el0(); } /** diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 8c607199cad1..f802a3b3f8db 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -242,7 +242,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK; switch (mode) { case PSR_AA32_MODE_USR: - if (!system_supports_32bit_el0()) + if (!kvm_supports_32bit_el0()) return -EINVAL; break; case PSR_AA32_MODE_FIQ: diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 87f1cd0df36e..c9a13e487187 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -993,7 +993,7 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, * THP doesn't start to split while we are adjusting the * refcounts. 
* - * We are sure this doesn't happen, because mmu_notifier_retry + * We are sure this doesn't happen, because mmu_invalidate_retry * was successful and we are holding the mmu_lock, so if this * THP is trying to split, it will be blocked in the mmu * notifier before touching any of the pages, specifically @@ -1188,9 +1188,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return ret; } - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = vcpu->kvm->mmu_invalidate_seq; /* - * Ensure the read of mmu_notifier_seq happens before we call + * Ensure the read of mmu_invalidate_seq happens before we call * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk * the page we just got a reference to gets unmapped before we have a * chance to grab the mmu_lock, which ensure that if the page gets @@ -1246,7 +1246,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, else write_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; - if (mmu_notifier_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; /* diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c059b259aea6..3234f50b8c4b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -652,7 +652,7 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) */ val = ((pmcr & ~ARMV8_PMU_PMCR_MASK) | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E); - if (!system_supports_32bit_el0()) + if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; __vcpu_sys_reg(vcpu, r->reg) = val; } @@ -701,7 +701,7 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, val = __vcpu_sys_reg(vcpu, PMCR_EL0); val &= ~ARMV8_PMU_PMCR_MASK; val |= p->regval & ARMV8_PMU_PMCR_MASK; - if (!system_supports_32bit_el0()) + if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; __vcpu_sys_reg(vcpu, PMCR_EL0) = val; kvm_pmu_handle_pmcr(vcpu, val); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 717716cc51c5..5cedb28e8a40 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -84,8 +84,6 @@ #define KVM_MAX_VCPUS 16 -/* memory slots that does not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 0 #define KVM_HALT_POLL_NS_DEFAULT 500000 diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index db17e870bdff..74cd64a24d05 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -615,17 +615,17 @@ retry: * Used to check for invalidations in progress, of the pfn that is * returned by pfn_to_pfn_prot below. */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; /* - * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in - * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads + * in gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't * risk the page we get a reference to getting unmapped before we have a - * chance to grab the mmu_lock without mmu_notifier_retry() noticing. + * chance to grab the mmu_lock without mmu_invalidate_retry() noticing. * * This smp_rmb() pairs with the effective smp_wmb() of the combination * of the pte_unmap_unlock() after the PTE is zapped, and the * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before - * mmu_notifier_seq is incremented. + * mmu_invalidate_seq is incremented. 
*/ smp_rmb(); @@ -638,7 +638,7 @@ retry: spin_lock(&kvm->mmu_lock); /* Check if an invalidation has taken place since we got pfn */ - if (mmu_notifier_retry(kvm, mmu_seq)) { + if (mmu_invalidate_retry(kvm, mmu_seq)) { /* * This can happen when mappings are changed asynchronously, but * also synchronously if a COW is triggered by diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 4def2bd17b9b..d49065af08e9 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -666,7 +666,7 @@ static inline pte_t *find_kvm_host_pte(struct kvm *kvm, unsigned long mmu_seq, VM_WARN(!spin_is_locked(&kvm->mmu_lock), "%s called with kvm mmu_lock not held \n", __func__); - if (mmu_notifier_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) return NULL; pte = __find_linux_pte(kvm->mm->pgd, ea, NULL, hshift); diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 1ae09992c9ea..bc6a381b5346 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -90,7 +90,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, unsigned long pfn; /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* Get host physical address for gpa */ @@ -151,7 +151,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, cpte = kvmppc_mmu_hpte_cache_next(vcpu); spin_lock(&kvm->mmu_lock); - if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { + if (!cpte || mmu_invalidate_retry(kvm, mmu_seq)) { r = -EAGAIN; goto out_unlock; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 514fd45c1994..e9744b41a226 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -578,7 +578,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, return -EFAULT; /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); ret = -EFAULT; @@ -693,7 +693,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, /* Check if we might have been invalidated; let the guest retry if so */ ret = RESUME_GUEST; - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) { unlock_rmap(rmap); goto out_unlock; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 9d4b3feda3b6..5d5e12f3bf86 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -640,7 +640,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Check if we might have been invalidated; let the guest retry if so */ spin_lock(&kvm->mmu_lock); ret = -EAGAIN; - if (mmu_notifier_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; /* Now traverse again under the lock and change the tree */ @@ -830,7 +830,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, bool large_enable; /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* @@ -1191,7 +1191,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm, * Increase the mmu notifier sequence number to prevent any page * fault that read the memslot earlier from writing a PTE. 
*/ - kvm->mmu_notifier_seq++; + kvm->mmu_invalidate_seq++; spin_unlock(&kvm->mmu_lock); } diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index be8249cc6107..5a64a1341e6f 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -1580,7 +1580,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, /* 2. Find the host pte for this L1 guest real address */ /* Used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* See if can find translation in our partition scoped tables for L1 */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 2257fb18cb72..5a05953ae13f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -219,7 +219,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, g_ptel = ptel; /* used later to detect if we might have been invalidated */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* Find the memslot (if any) for this address */ @@ -366,7 +366,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, rmap = real_vmalloc_addr(rmap); lock_rmap(rmap); /* Check for pending invalidations under the rmap chain lock */ - if (mmu_notifier_retry(kvm, mmu_seq)) { + if (mmu_invalidate_retry(kvm, mmu_seq)) { /* inval in progress, write a non-present HPTE */ pteh |= HPTE_V_ABSENT; pteh &= ~HPTE_V_VALID; @@ -932,7 +932,7 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu, int i; /* Used later to detect if we might have been invalidated */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); @@ -960,7 +960,7 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu, long ret = H_SUCCESS; /* Used later to detect if we might have been invalidated */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 7f16afc331ef..05668e964140 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -339,7 +339,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned long flags; /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* @@ -460,7 +460,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } spin_lock(&kvm->mmu_lock); - if (mmu_notifier_retry(kvm, mmu_seq)) { + if (mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; goto out; } diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 3a35b2d95697..3620ecac2fa1 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -666,7 +666,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, return ret; } - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable); if (hfn == KVM_PFN_ERR_HWPOISON) { @@ -686,7 +686,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, spin_lock(&kvm->mmu_lock); - if (mmu_notifier_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; if (writable) { diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..75cdd11ab014 100644 --- a/arch/x86/events/intel/core.c +++ 
b/arch/x86/events/intel/core.c @@ -4052,8 +4052,9 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) /* Disable guest PEBS if host PEBS is enabled. */ arr[pebs_enable].guest = 0; } else { - /* Disable guest PEBS for cross-mapped PEBS counters. */ + /* Disable guest PEBS thoroughly for cross-mapped PEBS counters. */ arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask; /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ arr[global_ctrl].guest |= arr[pebs_enable].guest; } diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 689880eca9ba..9b08082a5d9f 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -31,6 +31,16 @@ #define __noendbr __attribute__((nocf_check)) +/* + * Create a dummy function pointer reference to prevent objtool from marking + * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by + * apply_ibt_endbr()). + */ +#define IBT_NOSEAL(fname) \ + ".pushsection .discard.ibt_endbr_noseal\n\t" \ + _ASM_PTR fname "\n\t" \ + ".popsection\n\t" + static inline __attribute_const__ u32 gen_endbr(void) { u32 endbr; @@ -84,6 +94,7 @@ extern __noendbr void ibt_restore(u64 save); #ifndef __ASSEMBLY__ #define ASM_ENDBR +#define IBT_NOSEAL(name) #define __noendbr diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5ffa578cafe1..2c96c43c313a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -53,7 +53,7 @@ #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) /* memory slots that are not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 3 +#define KVM_INTERNAL_MEM_SLOTS 3 #define KVM_HALT_POLL_NS_DEFAULT 200000 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b4eeb7c75dfa..f092c54d1a2f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -326,7 +326,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); ".align " __stringify(FASTOP_SIZE) " \n\t" \ ".type " name ", @function \n\t" \ name ":\n\t" \ - ASM_ENDBR + ASM_ENDBR \ + IBT_NOSEAL(name) #define FOP_FUNC(name) \ __FOP_FUNC(#name) @@ -446,27 +447,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); FOP_END /* Special case for SETcc - 1 instruction per cc */ - -/* - * Depending on .config the SETcc functions look like: - * - * ENDBR [4 bytes; CONFIG_X86_KERNEL_IBT] - * SETcc %al [3 bytes] - * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK] - * INT3 [1 byte; CONFIG_SLS] - */ -#define SETCC_ALIGN 16 - #define FOP_SETCC(op) \ - ".align " __stringify(SETCC_ALIGN) " \n\t" \ - ".type " #op ", @function \n\t" \ - #op ": \n\t" \ - ASM_ENDBR \ + FOP_FUNC(op) \ #op " %al \n\t" \ - __FOP_RET(#op) \ - ".skip " __stringify(SETCC_ALIGN) " - (.-" #op "), 0xcc \n\t" + FOP_RET(op) -__FOP_START(setcc, SETCC_ALIGN) +FOP_START(setcc) FOP_SETCC(seto) FOP_SETCC(setno) FOP_SETCC(setc) @@ -1079,7 +1065,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt) static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; - void (*fop)(void) = (void *)em_setcc + SETCC_ALIGN * (condition & 0xf); + void (*fop)(void) = (void *)em_setcc + FASTOP_SIZE * (condition & 0xf); flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; asm("push %[flags]; popf; " CALL_NOSPEC diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index eccddb136954..e418ef3ecfcb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2914,7 
+2914,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; __direct_pte_prefetch(vcpu, sp, sptep); @@ -2928,7 +2928,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * * There are several ways to safely use this helper: * - * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before * consuming it. In this case, mmu_lock doesn't need to be held during the * lookup, but it does need to be held while checking the MMU notifier. * @@ -3056,7 +3056,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return; /* - * mmu_notifier_retry() was successful and mmu_lock is held, so + * mmu_invalidate_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ fault->goal_level = fault->req_level; @@ -4203,7 +4203,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4227,7 +4227,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); r = kvm_faultin_pfn(vcpu, fault); @@ -5361,19 +5361,6 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } -static bool need_remote_flush(u64 old, u64 new) -{ - if (!is_shadow_present_pte(old)) - return false; - if (!is_shadow_present_pte(new)) - return true; - if ((old ^ new) & SPTE_BASE_ADDR_MASK) - return true; - old ^= shadow_nx_mask; - new ^= shadow_nx_mask; - return (old & ~new & SPTE_PERM_MASK) != 0; -} - static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) { @@ -5519,7 +5506,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; - if (need_remote_flush(entry, *spte)) + if (is_shadow_present_pte(entry)) flush = true; ++spte; } @@ -6055,7 +6042,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); - kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); @@ -6069,7 +6056,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end - gfn_start); - kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end); write_unlock(&kvm->mmu_lock); } @@ -6085,47 +6072,18 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, - false); + slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, 
false); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); + kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); } - - /* - * Flush TLBs if any SPTEs had to be write-protected to ensure that - * guest writes are reflected in the dirty bitmap before the memslot - * update completes, i.e. before enabling dirty logging is visible to - * userspace. - * - * Perform the TLB flush outside the mmu_lock to reduce the amount of - * time the lock is held. However, this does mean that another CPU can - * now grab mmu_lock and encounter a write-protected SPTE while CPUs - * still have a writable mapping for the associated GFN in their TLB. - * - * This is safe but requires KVM to be careful when making decisions - * based on the write-protection status of an SPTE. Specifically, KVM - * also write-protects SPTEs to monitor changes to guest page tables - * during shadow paging, and must guarantee no CPUs can write to those - * page before the lock is dropped. As mentioned in the previous - * paragraph, a write-protected SPTE is no guarantee that CPU cannot - * perform writes. So to determine if a TLB flush is truly required, KVM - * will clear a separate software-only bit (MMU-writable) and skip the - * flush if-and-only-if this bit was already clear. - * - * See is_writable_pte() for more details. - */ - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) @@ -6493,32 +6451,30 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ - flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); + slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); + kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); } /* + * The caller will flush the TLBs after this function returns. + * * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB * out of mmu lock also guarantees no dirty pages will be lost in * dirty_bitmap. */ - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } void kvm_mmu_zap_all(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f5958071220c..39e0205e7300 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -589,7 +589,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. 
*/ - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; if (sp->role.direct) @@ -838,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); r = kvm_faultin_pfn(vcpu, fault); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f3744eea45f5..7670c13ce251 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -343,7 +343,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, } /* - * An shadow-present leaf SPTE may be non-writable for 3 possible reasons: + * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: * * 1. To intercept writes for dirty logging. KVM write-protects huge pages * so that they can be split be split down into the dirty logging @@ -361,8 +361,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * read-only memslot or guest memory backed by a read-only VMA. Writes to * such pages are disallowed entirely. * - * To keep track of why a given SPTE is write-protected, KVM uses 2 - * software-only bits in the SPTE: + * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this + * case, the SPTE is access-protected, not just write-protected! + * + * For cases #1 and #4, KVM can safely make such SPTEs writable without taking + * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. + * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits + * in the SPTE: * * shadow_mmu_writable_mask, aka MMU-writable - * Cleared on SPTEs that KVM is currently write-protecting for shadow paging @@ -391,7 +396,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging * (which does not clear the MMU-writable bit), does not flush TLBs before * dropping the lock, as it only needs to synchronize guest writes with the - * dirty bitmap. + * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for + * access-tracking via the clear_young() MMU notifier also does not flush TLBs. * * So, there is the problem: clearing the MMU-writable bit can encounter a * write-protected SPTE while CPUs still have writable mappings for that SPTE diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d7f8331d6f7e..c9b49a09e6b5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -843,8 +843,7 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, - MSR_IA32_SPEC_CTRL); + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); } unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 205ebdc2b11b..43a6a7efc6ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1557,12 +1557,32 @@ static const u32 msr_based_features_all[] = { static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; +/* + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM + * does not yet virtualize. 
These include: + * 10 - MISC_PACKAGE_CTRLS + * 11 - ENERGY_FILTERING_CTL + * 12 - DOITM + * 18 - FB_CLEAR_CTRL + * 21 - XAPIC_DISABLE_STATUS + * 23 - OVERCLOCKING_STATUS + */ + +#define KVM_SUPPORTED_ARCH_CAP \ + (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ + ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ + ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ + ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO) + static u64 kvm_get_arch_capabilities(void) { u64 data = 0; - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); + data &= KVM_SUPPORTED_ARCH_CAP; + } /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1610,9 +1630,6 @@ static u64 kvm_get_arch_capabilities(void) */ } - /* Guests don't need to know "Fill buffer clear control" exists */ - data &= ~ARCH_CAP_FB_CLEAR_CTRL; - return data; } @@ -10652,7 +10669,8 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) case KVM_MP_STATE_INIT_RECEIVED: break; default: - return -EINTR; + WARN_ON_ONCE(1); + break; } return 1; } @@ -11093,9 +11111,22 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - if (!lapic_in_kernel(vcpu) && - mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + switch (mp_state->mp_state) { + case KVM_MP_STATE_UNINITIALIZED: + case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: + case KVM_MP_STATE_INIT_RECEIVED: + case KVM_MP_STATE_SIPI_RECEIVED: + if (!lapic_in_kernel(vcpu)) + goto out; + break; + + case KVM_MP_STATE_RUNNABLE: + break; + + default: goto out; + } /* * KVM_MP_STATE_INIT_RECEIVED means the processor is in @@ -11563,7 +11594,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) - goto fail_free_pio_data; + goto fail_free_mce_banks; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, @@ -11617,7 +11648,6 @@ free_wbinvd_dirty_mask: fail_free_mce_banks: kfree(vcpu->arch.mce_banks); kfree(vcpu->arch.mci_ctl2_banks); -fail_free_pio_data: free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: kvm_free_lapic(vcpu); @@ -12473,6 +12503,50 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } else { kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); } + + /* + * Unconditionally flush the TLBs after enabling dirty logging. + * A flush is almost always going to be necessary (see below), + * and unconditionally flushing allows the helpers to omit + * the subtly complex checks when removing write access. + * + * Do the flush outside of mmu_lock to reduce the amount of + * time mmu_lock is held. Flushing after dropping mmu_lock is + * safe as KVM only needs to guarantee the slot is fully + * write-protected before returning to userspace, i.e. before + * userspace can consume the dirty status. + * + * Flushing outside of mmu_lock requires KVM to be careful when + * making decisions based on writable status of an SPTE, e.g. a + * !writable SPTE doesn't guarantee a CPU can't perform writes. + * + * Specifically, KVM also write-protects guest page tables to + * monitor changes when using shadow paging, and must guarantee + * no CPUs can write to those page before mmu_lock is dropped. 
+ * Because CPUs may have stale TLB entries at this point, a + * !writable SPTE doesn't guarantee CPUs can't perform writes. + * + * KVM also allows making SPTES writable outside of mmu_lock, + * e.g. to allow dirty logging without taking mmu_lock. + * + * To handle these scenarios, KVM uses a separate software-only + * bit (MMU-writable) to track if a SPTE is !writable due to + * a guest page table being write-protected (KVM clears the + * MMU-writable flag when write-protecting for shadow paging). + * + * The use of MMU-writable is also the primary motivation for + * the unconditional flush. Because KVM must guarantee that a + * CPU doesn't contain stale, writable TLB entries for a + * !MMU-writable SPTE, KVM must flush if it encounters any + * MMU-writable SPTE regardless of whether the actual hardware + * writable bit was set. I.e. KVM is almost guaranteed to need + * to flush, while unconditionally flushing allows the "remove + * write access" helpers to ignore MMU-writable entirely. + * + * See is_writable_pte() for more details (the case involving + * access-tracked SPTEs is particularly relevant). + */ + kvm_arch_flush_remote_tlbs_memslot(kvm, new); } } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1c480b1821e1..f4519d3689e1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -656,12 +656,12 @@ struct kvm_irq_routing_table { }; #endif -#ifndef KVM_PRIVATE_MEM_SLOTS -#define KVM_PRIVATE_MEM_SLOTS 0 +#ifndef KVM_INTERNAL_MEM_SLOTS +#define KVM_INTERNAL_MEM_SLOTS 0 #endif #define KVM_MEM_SLOTS_NUM SHRT_MAX -#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS) +#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) @@ -765,10 +765,10 @@ struct kvm { #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) struct mmu_notifier mmu_notifier; - unsigned long mmu_notifier_seq; - long mmu_notifier_count; - unsigned long mmu_notifier_range_start; - unsigned long mmu_notifier_range_end; + unsigned long mmu_invalidate_seq; + long mmu_invalidate_in_progress; + unsigned long mmu_invalidate_range_start; + unsigned long mmu_invalidate_range_end; #endif struct list_head devices; u64 manual_dirty_log_protect; @@ -1357,10 +1357,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, - unsigned long end); -void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, - unsigned long end); +void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, + unsigned long end); +void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, + unsigned long end); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -1907,42 +1907,44 @@ extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) -static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) +static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { - if (unlikely(kvm->mmu_notifier_count)) + if (unlikely(kvm->mmu_invalidate_in_progress)) return 1; /* - * Ensure the read of mmu_notifier_count happens before the read - * of mmu_notifier_seq. 
This interacts with the smp_wmb() in - * mmu_notifier_invalidate_range_end to make sure that the caller - * either sees the old (non-zero) value of mmu_notifier_count or - * the new (incremented) value of mmu_notifier_seq. - * PowerPC Book3s HV KVM calls this under a per-page lock - * rather than under kvm->mmu_lock, for scalability, so - * can't rely on kvm->mmu_lock to keep things ordered. + * Ensure the read of mmu_invalidate_in_progress happens before + * the read of mmu_invalidate_seq. This interacts with the + * smp_wmb() in mmu_notifier_invalidate_range_end to make sure + * that the caller either sees the old (non-zero) value of + * mmu_invalidate_in_progress or the new (incremented) value of + * mmu_invalidate_seq. + * + * PowerPC Book3s HV KVM calls this under a per-page lock rather + * than under kvm->mmu_lock, for scalability, so can't rely on + * kvm->mmu_lock to keep things ordered. */ smp_rmb(); - if (kvm->mmu_notifier_seq != mmu_seq) + if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } -static inline int mmu_notifier_retry_hva(struct kvm *kvm, - unsigned long mmu_seq, - unsigned long hva) +static inline int mmu_invalidate_retry_hva(struct kvm *kvm, + unsigned long mmu_seq, + unsigned long hva) { lockdep_assert_held(&kvm->mmu_lock); /* - * If mmu_notifier_count is non-zero, then the range maintained by - * kvm_mmu_notifier_invalidate_range_start contains all addresses that - * might be being invalidated. Note that it may include some false + * If mmu_invalidate_in_progress is non-zero, then the range maintained + * by kvm_mmu_notifier_invalidate_range_start contains all addresses + * that might be being invalidated. Note that it may include some false * positives, due to shortcuts when handing concurrent invalidations. */ - if (unlikely(kvm->mmu_notifier_count) && - hva >= kvm->mmu_notifier_range_start && - hva < kvm->mmu_notifier_range_end) + if (unlikely(kvm->mmu_invalidate_in_progress) && + hva >= kvm->mmu_invalidate_range_start && + hva < kvm->mmu_invalidate_range_end) return 1; - if (kvm->mmu_notifier_seq != mmu_seq) + if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0cec74da7ffe..91678252a9b6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4096,7 +4096,8 @@ static int validate_ibt(struct objtool_file *file) * These sections can reference text addresses, but not with * the intent to indirect branch to them. */ - if (!strncmp(sec->name, ".discard", 8) || + if ((!strncmp(sec->name, ".discard", 8) && + strcmp(sec->name, ".discard.ibt_endbr_noseal")) || !strncmp(sec->name, ".debug", 6) || !strcmp(sec->name, ".altinstructions") || !strcmp(sec->name, ".ibt_endbr_seal") || diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 45edf45821d0..0cbc71b7af50 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -754,7 +754,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); /* If a toddler were to say "abracadabra". 
*/ -#define KVM_EXCEPTION_MAGIC 0xabacadabaull +#define KVM_EXCEPTION_MAGIC 0xabacadabaULL /* * KVM selftest exception fixup uses registers to coordinate with the exception @@ -786,7 +786,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, "lea 1f(%%rip), %%r10\n\t" \ "lea 2f(%%rip), %%r11\n\t" \ "1: " insn "\n\t" \ - "mov $0, %[vector]\n\t" \ + "movb $0, %[vector]\n\t" \ "jmp 3f\n\t" \ "2:\n\t" \ "mov %%r9b, %[vector]\n\t" \ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 515dfe9d3bcf..584a5bab3af3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -702,30 +702,31 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, /* * .change_pte() must be surrounded by .invalidate_range_{start,end}(). - * If mmu_notifier_count is zero, then no in-progress invalidations, - * including this one, found a relevant memslot at start(); rechecking - * memslots here is unnecessary. Note, a false positive (count elevated - * by a different invalidation) is sub-optimal but functionally ok. + * If mmu_invalidate_in_progress is zero, then no in-progress + * invalidations, including this one, found a relevant memslot at + * start(); rechecking memslots here is unnecessary. Note, a false + * positive (count elevated by a different invalidation) is sub-optimal + * but functionally ok. */ WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); - if (!READ_ONCE(kvm->mmu_notifier_count)) + if (!READ_ONCE(kvm->mmu_invalidate_in_progress)) return; kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); } -void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, - unsigned long end) +void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, + unsigned long end) { /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and * count is also read inside the mmu_lock critical section. */ - kvm->mmu_notifier_count++; - if (likely(kvm->mmu_notifier_count == 1)) { - kvm->mmu_notifier_range_start = start; - kvm->mmu_notifier_range_end = end; + kvm->mmu_invalidate_in_progress++; + if (likely(kvm->mmu_invalidate_in_progress == 1)) { + kvm->mmu_invalidate_range_start = start; + kvm->mmu_invalidate_range_end = end; } else { /* * Fully tracking multiple concurrent ranges has diminishing @@ -736,10 +737,10 @@ void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, * accumulate and persist until all outstanding invalidates * complete. */ - kvm->mmu_notifier_range_start = - min(kvm->mmu_notifier_range_start, start); - kvm->mmu_notifier_range_end = - max(kvm->mmu_notifier_range_end, end); + kvm->mmu_invalidate_range_start = + min(kvm->mmu_invalidate_range_start, start); + kvm->mmu_invalidate_range_end = + max(kvm->mmu_invalidate_range_end, end); } } @@ -752,7 +753,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .end = range->end, .pte = __pte(0), .handler = kvm_unmap_gfn_range, - .on_lock = kvm_inc_notifier_count, + .on_lock = kvm_mmu_invalidate_begin, .on_unlock = kvm_arch_guest_memory_reclaimed, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), @@ -763,7 +764,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, /* * Prevent memslot modification between range_start() and range_end() * so that conditionally locking provides the same result in both - * functions. Without that guarantee, the mmu_notifier_count + * functions. 
Without that guarantee, the mmu_invalidate_in_progress * adjustments will be imbalanced. * * Pairs with the decrement in range_end(). @@ -779,7 +780,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * any given time, and the caches themselves can check for hva overlap, * i.e. don't need to rely on memslot overlap checks for performance. * Because this runs without holding mmu_lock, the pfn caches must use - * mn_active_invalidate_count (see above) instead of mmu_notifier_count. + * mn_active_invalidate_count (see above) instead of + * mmu_invalidate_in_progress. */ gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, hva_range.may_block); @@ -789,22 +791,22 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } -void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, - unsigned long end) +void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, + unsigned long end) { /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have * been freed. */ - kvm->mmu_notifier_seq++; + kvm->mmu_invalidate_seq++; smp_wmb(); /* * The above sequence increase must be visible before the * below count decrease, which is ensured by the smp_wmb above - * in conjunction with the smp_rmb in mmu_notifier_retry(). + * in conjunction with the smp_rmb in mmu_invalidate_retry(). */ - kvm->mmu_notifier_count--; + kvm->mmu_invalidate_in_progress--; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -816,7 +818,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .end = range->end, .pte = __pte(0), .handler = (void *)kvm_null_fn, - .on_lock = kvm_dec_notifier_count, + .on_lock = kvm_mmu_invalidate_end, .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), @@ -837,7 +839,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, if (wake) rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); - BUG_ON(kvm->mmu_notifier_count < 0); + BUG_ON(kvm->mmu_invalidate_in_progress < 0); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, @@ -1134,6 +1136,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (!kvm) return ERR_PTR(-ENOMEM); + /* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */ + __module_get(kvm_chardev_ops.owner); + KVM_MMU_LOCK_INIT(kvm); mmgrab(current->mm); kvm->mm = current->mm; @@ -1211,9 +1216,17 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (r) goto out_err_no_mmu_notifier; + r = kvm_coalesced_mmio_init(kvm); + if (r < 0) + goto out_no_coalesced_mmio; + + r = kvm_create_vm_debugfs(kvm, fdname); + if (r) + goto out_err_no_debugfs; + r = kvm_arch_post_init_vm(kvm); if (r) - goto out_err_mmu_notifier; + goto out_err; mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); @@ -1222,25 +1235,13 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) preempt_notifier_inc(); kvm_init_pm_notifier(kvm); - /* - * When the fd passed to this ioctl() is opened it pins the module, - * but try_module_get() also prevents getting a reference if the module - * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait"). 
- */ - if (!try_module_get(kvm_chardev_ops.owner)) { - r = -ENODEV; - goto out_err_mmu_notifier; - } - - r = kvm_create_vm_debugfs(kvm, fdname); - if (r) - goto out_err; - return kvm; out_err: - module_put(kvm_chardev_ops.owner); -out_err_mmu_notifier: + kvm_destroy_vm_debugfs(kvm); +out_err_no_debugfs: + kvm_coalesced_mmio_free(kvm); +out_no_coalesced_mmio: #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); @@ -1259,6 +1260,7 @@ out_err_no_irq_srcu: out_err_no_srcu: kvm_arch_free_vm(kvm); mmdrop(current->mm); + module_put(kvm_chardev_ops.owner); return ERR_PTR(r); } @@ -2516,7 +2518,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, { unsigned int flags = FOLL_HWPOISON; struct page *page; - int npages = 0; + int npages; might_sleep(); @@ -4378,7 +4380,7 @@ void kvm_unregister_device_ops(u32 type) static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { - const struct kvm_device_ops *ops = NULL; + const struct kvm_device_ops *ops; struct kvm_device *dev; bool test = cd->flags & KVM_CREATE_DEVICE_TEST; int type; @@ -4913,11 +4915,6 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) goto put_fd; } -#ifdef CONFIG_KVM_MMIO - r = kvm_coalesced_mmio_init(kvm); - if (r < 0) - goto put_kvm; -#endif file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (IS_ERR(file)) { r = PTR_ERR(file); diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index ab519f72f2cd..68ff41d39545 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -112,27 +112,28 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s { /* * mn_active_invalidate_count acts for all intents and purposes - * like mmu_notifier_count here; but the latter cannot be used - * here because the invalidation of caches in the mmu_notifier - * event occurs _before_ mmu_notifier_count is elevated. + * like mmu_invalidate_in_progress here; but the latter cannot + * be used here because the invalidation of caches in the + * mmu_notifier event occurs _before_ mmu_invalidate_in_progress + * is elevated. * * Note, it does not matter that mn_active_invalidate_count * is not protected by gpc->lock. It is guaranteed to * be elevated before the mmu_notifier acquires gpc->lock, and - * isn't dropped until after mmu_notifier_seq is updated. + * isn't dropped until after mmu_invalidate_seq is updated. */ if (kvm->mn_active_invalidate_count) return true; /* * Ensure mn_active_invalidate_count is read before - * mmu_notifier_seq. This pairs with the smp_wmb() in + * mmu_invalidate_seq. This pairs with the smp_wmb() in * mmu_notifier_invalidate_range_end() to guarantee either the * old (non-zero) value of mn_active_invalidate_count or the - * new (incremented) value of mmu_notifier_seq is observed. + * new (incremented) value of mmu_invalidate_seq is observed. */ smp_rmb(); - return kvm->mmu_notifier_seq != mmu_seq; + return kvm->mmu_invalidate_seq != mmu_seq; } static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) @@ -155,7 +156,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) gpc->valid = false; do { - mmu_seq = kvm->mmu_notifier_seq; + mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); write_unlock_irq(&gpc->lock); |
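
The bulk of the changes above are a mechanical rename of the MMU-notifier bookkeeping: mmu_notifier_seq becomes mmu_invalidate_seq, mmu_notifier_count becomes mmu_invalidate_in_progress, and mmu_notifier_retry() / kvm_inc_notifier_count() / kvm_dec_notifier_count() become mmu_invalidate_retry() / kvm_mmu_invalidate_begin() / kvm_mmu_invalidate_end(). The sketch below is a minimal, self-contained userspace model of the retry protocol those fields implement — not kernel code: the field names mirror the renamed struct members, while kvm->mmu_lock and the smp_rmb()/smp_wmb() pairing are reduced to comments.

/*
 * Minimal userspace model (not kernel code) of the invalidate-retry
 * protocol behind the mmu_notifier_* -> mmu_invalidate_* rename above.
 * Field names mirror the renamed kvm struct members; kvm->mmu_lock and
 * the smp_rmb()/smp_wmb() pairing are reduced to comments.
 */
#include <stdbool.h>
#include <stdio.h>

struct mmu_model {
	unsigned long invalidate_seq;    /* kvm->mmu_invalidate_seq */
	long invalidate_in_progress;     /* kvm->mmu_invalidate_in_progress */
};

static void invalidate_begin(struct mmu_model *m)
{
	/* Real code: runs under mmu_lock via kvm_mmu_invalidate_begin(). */
	m->invalidate_in_progress++;
	/* ... the affected guest range is unmapped here ... */
}

static void invalidate_end(struct mmu_model *m)
{
	/*
	 * Bump the sequence before dropping the in-progress count, so a
	 * fault handler sees either the elevated count or the new seq.
	 */
	m->invalidate_seq++;
	/* Real code: smp_wmb() here, paired with the fault path's smp_rmb(). */
	m->invalidate_in_progress--;
}

/* Returns true if a fault handler's snapshot is stale and it must retry. */
static bool invalidate_retry(struct mmu_model *m, unsigned long seq)
{
	if (m->invalidate_in_progress)
		return true;
	/* Real code: smp_rmb() between the two reads. */
	return m->invalidate_seq != seq;
}

int main(void)
{
	struct mmu_model m = { 0 };
	unsigned long seq;

	/*
	 * Fault path: snapshot the sequence before resolving the host page
	 * without holding mmu_lock (gfn_to_pfn and friends may sleep).
	 */
	seq = m.invalidate_seq;

	/* No invalidation ran, so the mapping may be installed. */
	printf("first fault: %s\n",
	       invalidate_retry(&m, seq) ? "retry" : "install mapping");

	/* An invalidation races in while the lock was dropped... */
	invalidate_begin(&m);
	invalidate_end(&m);

	/* ...so the same snapshot is now stale and the fault must retry. */
	printf("after invalidation: %s\n",
	       invalidate_retry(&m, seq) ? "retry" : "install mapping");
	return 0;
}

In the real fault paths touched by this diff, the snapshot is taken before gfn_to_pfn_prot()/kvm_faultin_pfn(), and the retry check is repeated under mmu_lock before any PTE or SPTE is installed.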