41 files changed, 1017 insertions, 903 deletions
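Editor's note before the diff body: the bulk of the x86 changes below fold the long page-fault parameter lists (gpa, error code, prefault, max_level, pfn, hva, map_writable, ...) into a single struct kvm_page_fault that kvm_mmu_do_page_fault() fills in once and then threads through the TDP and shadow-MMU handlers. A rough, user-space-buildable sketch of that pattern follows; the types, constants and function names here are illustrative stand-ins, not the kernel's actual definitions:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-ins for the real PFERR_* bits. */
#define PFERR_PRESENT_MASK (1u << 0)
#define PFERR_WRITE_MASK   (1u << 1)
#define PFERR_USER_MASK    (1u << 2)
#define PFERR_RSVD_MASK    (1u << 3)
#define PFERR_FETCH_MASK   (1u << 4)

/* Cut-down analogue of struct kvm_page_fault: the error code is decoded
 * once and a single pointer is passed down instead of many scalars. */
struct page_fault {
	uint64_t addr;
	uint32_t error_code;
	bool prefault;
	bool exec, write, present, rsvd, user;
	uint8_t max_level, req_level, goal_level;
};

/* Stand-in for kvm_tdp_page_fault()/FNAME(page_fault): handlers read the
 * decoded fields rather than re-parsing the error code. */
static int handle_fault(const struct page_fault *fault)
{
	return (fault->write && !fault->present) ? 1 : 0;
}

/* Stand-in for kvm_mmu_do_page_fault(): build the struct, then dispatch. */
int do_page_fault(uint64_t addr, uint32_t err, bool prefault)
{
	struct page_fault fault = {
		.addr       = addr,
		.error_code = err,
		.prefault   = prefault,
		.exec       = err & PFERR_FETCH_MASK,
		.write      = err & PFERR_WRITE_MASK,
		.present    = err & PFERR_PRESENT_MASK,
		.rsvd       = err & PFERR_RSVD_MASK,
		.user       = err & PFERR_USER_MASK,
		.max_level  = 4,	/* KVM_MAX_HUGEPAGE_LEVEL analogue */
		.req_level  = 1,	/* PG_LEVEL_4K analogue */
		.goal_level = 1,
	};

	return handle_fault(&fault);
}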
diff --git a/Documentation/virt/kvm/devices/xics.rst b/Documentation/virt/kvm/devices/xics.rst index 2d6927e0b776..bf32c77174ab 100644 --- a/Documentation/virt/kvm/devices/xics.rst +++ b/Documentation/virt/kvm/devices/xics.rst @@ -22,7 +22,7 @@ Groups: Errors: ======= ========================================== - -EINVAL Value greater than KVM_MAX_VCPU_ID. + -EINVAL Value greater than KVM_MAX_VCPU_IDS. -EFAULT Invalid user pointer for attr->addr. -EBUSY A vcpu is already connected to the device. ======= ========================================== diff --git a/Documentation/virt/kvm/devices/xive.rst b/Documentation/virt/kvm/devices/xive.rst index 8bdf3dc38f01..8b5e7b40bdf8 100644 --- a/Documentation/virt/kvm/devices/xive.rst +++ b/Documentation/virt/kvm/devices/xive.rst @@ -91,7 +91,7 @@ the legacy interrupt mode, referred as XICS (POWER7/8). Errors: ======= ========================================== - -EINVAL Value greater than KVM_MAX_VCPU_ID. + -EINVAL Value greater than KVM_MAX_VCPU_IDS. -EFAULT Invalid user pointer for attr->addr. -EBUSY A vCPU is already connected to the device. ======= ========================================== diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f8be56d5342b..369c30e28301 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -771,7 +771,6 @@ int kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); -void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fe102cd2e518..7838e9fb693e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -297,14 +297,6 @@ struct kvm *kvm_arch_alloc_vm(void) return vzalloc(sizeof(struct kvm)); } -void kvm_arch_free_vm(struct kvm *kvm) -{ - if (!has_vhe()) - kfree(kvm); - else - vfree(kvm); -} - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 75c6f264c626..562aa878b266 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1073,7 +1073,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_MIPS_FPU: /* We don't handle systems with inconsistent cpu_has_fpu */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index caaa0f592d8e..3d31f2c59e43 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -434,7 +434,7 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); #define SPLIT_HACK_OFFS 0xfb000000 /* - * This packs a VCPU ID from the [0..KVM_MAX_VCPU_ID) space down to the + * This packs a VCPU ID from the [0..KVM_MAX_VCPU_IDS) space down to the * [0..KVM_MAX_VCPUS) space, using knowledge of the guest's core stride * (but not its actual threading mode, which is not available) to avoid * collisions. 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 080a7feb7731..59cb38b04ede 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -33,11 +33,11 @@ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */ -#define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES) +#define KVM_MAX_VCPU_IDS (MAX_SMT_THREADS * KVM_MAX_VCORES) #define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS #else -#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #define __KVM_HAVE_ARCH_INTC_INITIALIZED diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index a18db9e16ea4..225008882958 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1928,7 +1928,7 @@ int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr) pr_devel("%s nr_servers=%u\n", __func__, nr_servers); - if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID) + if (!nr_servers || nr_servers > KVM_MAX_VCPU_IDS) return -EINVAL; mutex_lock(&xive->lock); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b4e6f70b97b9..8ab90ce8738f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -649,7 +649,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_PPC_GET_SMMU_INFO: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8f48a7ec577..5271fce6cd65 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -50,7 +50,7 @@ * so ratio of 4 should be enough. */ #define KVM_VCPU_ID_RATIO 4 -#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) +#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -407,6 +407,7 @@ struct kvm_mmu_root_info { #define KVM_HAVE_MMU_RWLOCK struct kvm_mmu_page; +struct kvm_page_fault; /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, @@ -416,8 +417,7 @@ struct kvm_mmu_page; struct kvm_mmu { unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); - int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, - bool prefault); + int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa, @@ -581,7 +581,6 @@ struct kvm_vcpu_hv { struct kvm_hyperv_exit exit; struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); - cpumask_t tlb_flush; bool enforce_cpuid; struct { u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */ @@ -1212,6 +1211,14 @@ struct kvm_arch { */ bool memslots_have_rmaps; + /* + * Set when the KVM mmu needs guest write access page tracking. If + * set, the necessary gfn_track arrays have been allocated for + * all memslots and should be allocated for any newly created or + * modified memslots. 
+ */ + bool memslots_mmu_write_tracking; + #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; @@ -1541,6 +1548,8 @@ static inline struct kvm *kvm_arch_alloc_vm(void) { return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } + +#define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB @@ -1715,9 +1724,6 @@ void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t gfn, void *data, int offset, int len, - u32 access); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); @@ -1866,7 +1872,6 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); -void kvm_make_mclock_inprogress_request(struct kvm *kvm); void kvm_make_scan_ioapic_request(struct kvm *kvm); void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 6a5f3acf2b33..79d84a94f8eb 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -49,8 +49,11 @@ struct kvm_page_track_notifier_node { int kvm_page_track_init(struct kvm *kvm); void kvm_page_track_cleanup(struct kvm *kvm); +int kvm_page_track_enable_mmu_write_tracking(struct kvm *kvm); + void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages); void kvm_slot_page_track_add_page(struct kvm *kvm, @@ -59,8 +62,9 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, void kvm_slot_page_track_remove_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode); -bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode); +bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode); void kvm_page_track_register_notifier(struct kvm *kvm, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ac69894eab88..619186138176 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -129,4 +129,7 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. +config KVM_EXTERNAL_WRITE_TRACKING + bool + endif # VIRTUALIZATION diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 751aa85a3001..2d70edb0f323 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -53,9 +53,16 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } +/* + * This one is tied to SSB in the user API, and not + * visible in /proc/cpuinfo. + */ +#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ + #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? 
F(name) : 0) + static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) { @@ -500,7 +507,8 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | + __feature_bit(KVM_X86_FEATURE_PSFD) ); /* diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index d5124b520f76..6f11cda2bfa4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1754,7 +1754,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool int i; gpa_t gpa; struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; @@ -1836,18 +1835,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool } } - cpumask_clear(&hv_vcpu->tlb_flush); - - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); - /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, - NULL, vcpu_mask, &hv_vcpu->tlb_flush); + if (all_cpus) { + kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); + } else { + vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, + vcpu_mask); + } ret_success: /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 8c065da73f8e..816a82515dcd 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bbd4a5d18b5d..e66e620c3bed 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -39,13 +39,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1); + DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID + 1]; + u8 vectors[KVM_MAX_VCPU_IDS]; }; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e9688a9f7b57..75367af1a6d3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -114,17 +114,91 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) vcpu->arch.mmu->shadow_root_level); } -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault); +struct kvm_page_fault { + /* arguments to kvm_mmu_do_page_fault. */ + const gpa_t addr; + const u32 error_code; + const bool prefault; + + /* Derived from error_code. */ + const bool exec; + const bool write; + const bool present; + const bool rsvd; + const bool user; + + /* Derived from mmu and global state. 
*/ + const bool is_tdp; + const bool nx_huge_page_workaround_enabled; + + /* + * Whether a >4KB mapping can be created or is forbidden due to NX + * hugepages. + */ + bool huge_page_disallowed; + + /* + * Maximum page size that can be created for this fault; input to + * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + */ + u8 max_level; + + /* + * Page size that can be created based on the max_level and the + * page size used by the host mapping. + */ + u8 req_level; + + /* + * Page size that will be created based on the req_level and + * huge_page_disallowed. + */ + u8 goal_level; + + /* Shifted addr, or result of guest page table walk if addr is a gva. */ + gfn_t gfn; + + /* The memslot containing gfn. May be NULL. */ + struct kvm_memory_slot *slot; + + /* Outputs of kvm_faultin_pfn. */ + kvm_pfn_t pfn; + hva_t hva; + bool map_writable; +}; + +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); + +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, bool prefault) { + struct kvm_page_fault fault = { + .addr = cr2_or_gpa, + .error_code = err, + .exec = err & PFERR_FETCH_MASK, + .write = err & PFERR_WRITE_MASK, + .present = err & PFERR_PRESENT_MASK, + .rsvd = err & PFERR_RSVD_MASK, + .user = err & PFERR_USER_MASK, + .prefault = prefault, + .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + + .max_level = KVM_MAX_HUGEPAGE_LEVEL, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + }; #ifdef CONFIG_RETPOLINE - if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault)) - return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault); + if (fault.is_tdp) + return kvm_tdp_page_fault(vcpu, &fault); #endif - return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault); + return vcpu->arch.mmu->page_fault(vcpu, &fault); } /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1a64ba5b9437..24a9f4c3f5e7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1071,20 +1071,6 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) return kvm_mmu_memory_cache_nr_free_objects(mc); } -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - struct kvm_mmu_page *sp; - struct kvm_rmap_head *rmap_head; - - sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - return pte_list_add(vcpu, spte, rmap_head); -} - - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; @@ -1097,9 +1083,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); /* - * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the - * context of a vCPU so have to determine which memslots to use based - * on context information in sp->role. + * Unlike rmap_add, rmap_remove does not run in the context of a vCPU + * so we have to determine which memslots to use based on context + * information in sp->role. 
*/ slots = kvm_memslots_for_spte_role(kvm, sp->role); @@ -1639,19 +1625,23 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn) { - struct kvm_memory_slot *slot; - struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; + struct kvm_rmap_head *rmap_head; + int rmap_count; sp = sptep_to_sp(spte); - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + rmap_count = pte_list_add(vcpu, spte, rmap_head); - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) { + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_flush_remote_tlbs_with_address( + vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + } } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1795,7 +1785,7 @@ static void mark_unsync(u64 *spte) static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return 0; + return -1; } #define KVM_PAGE_ARRAY_NR 16 @@ -1909,12 +1899,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { + int ret = vcpu->arch.mmu->sync_page(vcpu, sp); + + if (ret < 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } - return true; + return !!ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, @@ -1931,17 +1923,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, return true; } -static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, - struct list_head *invalid_list, - bool remote_flush, bool local_flush) -{ - if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) - return; - - if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); -} - #ifdef CONFIG_KVM_MMU_AUDIT #include "mmu_audit.c" #else @@ -2044,7 +2025,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) { - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); flush = false; } @@ -2054,7 +2035,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); if (!can_yield) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); return -EINTR; @@ -2065,7 +2046,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, } } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); return 0; } @@ -2149,7 +2130,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, break; WARN_ON(!list_empty(&invalid_list)); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); } __clear_sp_write_flooding_count(sp); @@ -2229,7 +2210,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) static void __shadow_walk_next(struct 
kvm_shadow_walk_iterator *iterator, u64 spte) { - if (is_last_spte(spte, iterator->level)) { + if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { iterator->level = 0; return; } @@ -2591,7 +2572,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + gfn_t gfn, bool can_unsync, bool speculative) { struct kvm_mmu_page *sp; bool locked = false; @@ -2601,7 +2583,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu, slot, gfn, KVM_PAGE_TRACK_WRITE)) return -EPERM; /* @@ -2617,6 +2599,9 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) if (sp->unsync) continue; + if (speculative) + return -EEXIST; + /* * TDP MMU page faults require an additional spinlock as they * run with mmu_lock held for read, not write, and the unsync @@ -2688,40 +2673,22 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) return 0; } -static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool can_unsync, bool host_writable) -{ - u64 spte; - struct kvm_mmu_page *sp; - int ret; - - sp = sptep_to_sp(sptep); - - ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, - can_unsync, host_writable, sp_ad_disabled(sp), &spte); - - if (spte & PT_WRITABLE_MASK) - kvm_vcpu_mark_page_dirty(vcpu, gfn); - - if (*sptep == spte) - ret |= SET_SPTE_SPURIOUS; - else if (mmu_spte_update(sptep, spte)) - ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; - return ret; -} - -static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, bool write_fault, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + u64 *sptep, unsigned int pte_access, gfn_t gfn, + kvm_pfn_t pfn, struct kvm_page_fault *fault) { + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + int level = sp->role.level; int was_rmapped = 0; - int rmap_count; - int set_spte_ret; int ret = RET_PF_FIXED; bool flush = false; + bool wrprot; + u64 spte; + + /* Prefetching always gets a writable pfn. 
*/ + bool host_writable = !fault || fault->map_writable; + bool speculative = !fault || fault->prefault; + bool write_fault = fault && fault->write; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2752,52 +2719,36 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, was_rmapped = 1; } - set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, - speculative, true, host_writable); - if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, speculative, + true, host_writable, &spte); + + if (*sptep == spte) { + ret = RET_PF_SPURIOUS; + } else { + trace_kvm_mmu_set_spte(level, gfn, sptep); + flush |= mmu_spte_update(sptep, spte); + } + + if (wrprot) { if (write_fault) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) + if (flush) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - /* - * The fault is fully spurious if and only if the new SPTE and old SPTE - * are identical, and emulation is not required. - */ - if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { - WARN_ON_ONCE(!was_rmapped); - return RET_PF_SPURIOUS; - } - pgprintk("%s: setting spte %llx\n", __func__, *sptep); - trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped) { + WARN_ON_ONCE(ret == RET_PF_SPURIOUS); kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn); } return ret; } -static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) - return KVM_PFN_ERR_FAULT; - - return gfn_to_pfn_memslot_atomic(slot, gfn); -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2818,8 +2769,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + mmu_set_spte(vcpu, slot, start, access, gfn, + page_to_pfn(pages[i]), NULL); put_page(pages[i]); } @@ -2842,11 +2793,13 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, if (!start) continue; if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) - break; + return; start = NULL; } else if (!start) start = spte; } + if (start) + direct_pte_prefetch_many(vcpu, sp, start, spte); } static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) @@ -2924,52 +2877,46 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, return min(host_level, max_level); } -int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level) +void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_memory_slot *slot; - kvm_pfn_t pfn = *pfnp; + struct kvm_memory_slot *slot = fault->slot; kvm_pfn_t mask; - int level; - *req_level = PG_LEVEL_4K; + fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; - if (unlikely(max_level == PG_LEVEL_4K)) - return PG_LEVEL_4K; + if (unlikely(fault->max_level == PG_LEVEL_4K)) + return; - if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) - return PG_LEVEL_4K; + if 
(is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + return; - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); - if (!slot) - return PG_LEVEL_4K; + if (kvm_slot_dirty_track_enabled(slot)) + return; /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K || huge_page_disallowed) - return PG_LEVEL_4K; + fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, + fault->gfn, fault->pfn, + fault->max_level); + if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) + return; /* * mmu_notifier_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - *pfnp = pfn & ~mask; - - return level; + fault->goal_level = fault->req_level; + mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; + VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); + fault->pfn &= ~mask; } -void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *goal_levelp) +void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) { - int level = *goal_levelp; - - if (cur_level == level && level > PG_LEVEL_4K && + if (cur_level > PG_LEVEL_4K && + cur_level == fault->goal_level && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -2979,42 +2926,33 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - - KVM_PAGES_PER_HPAGE(level - 1); - *pfnp |= gfn & page_mask; - (*goal_levelp)--; + u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - + KVM_PAGES_PER_HPAGE(cur_level - 1); + fault->pfn |= fault->gfn & page_mask; + fault->goal_level--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool is_tdp) +static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, req_level, ret; - gfn_t gfn = gpa >> PAGE_SHIFT; - gfn_t base_gfn = gfn; + int ret; + gfn_t base_gfn = fault->gfn; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(gpa, level, pfn); - for_each_shadow_entry(vcpu, gpa, it) { + trace_kvm_mmu_spte_requested(fault); + for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. 
*/ - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(*it.sptep, gfn, it.level, - &pfn, &level); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == level) + base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == fault->goal_level) break; drop_large_spte(vcpu, it.sptep); @@ -3025,14 +2963,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) + if (fault->is_tdp && fault->huge_page_disallowed && + fault->req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } - ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, - write, level, base_gfn, pfn, prefault, - map_writable); + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, + base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -3064,18 +3004,19 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return -EFAULT; } -static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - kvm_pfn_t pfn, unsigned int access, - int *ret_val) +static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access, int *ret_val) { /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(pfn))) { - *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); + if (unlikely(is_error_pfn(fault->pfn))) { + *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); return true; } - if (unlikely(is_noslot_pfn(pfn))) { - vcpu_cache_mmio_info(vcpu, gva, gfn, + if (unlikely(!fault->slot)) { + gva_t gva = fault->is_tdp ? 0 : fault->addr; + + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); /* * If MMIO caching is disabled, emulate immediately without @@ -3091,18 +3032,17 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, return false; } -static bool page_fault_can_be_fast(u32 error_code) +static bool page_fault_can_be_fast(struct kvm_page_fault *fault) { /* * Do not fix the mmio spte with invalid generation number which * need to be updated by slow page fault path. */ - if (unlikely(error_code & PFERR_RSVD_MASK)) + if (fault->rsvd) return false; /* See if the page fault is due to an NX violation */ - if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) - == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + if (unlikely(fault->exec && fault->present)) return false; /* @@ -3119,9 +3059,7 @@ static bool page_fault_can_be_fast(u32 error_code) * accesses to a present page. */ - return shadow_acc_track_mask != 0 || - ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) - == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); + return shadow_acc_track_mask != 0 || (fault->write && fault->present); } /* @@ -3129,13 +3067,9 @@ static bool page_fault_can_be_fast(u32 error_code) * someone else modified the SPTE from its original value. */ static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, u64 new_spte) { - gfn_t gfn; - - WARN_ON(!sp->role.direct); - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. 
See comments in @@ -3151,24 +3085,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { - /* - * The gfn of direct spte is stable since it is - * calculated by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) + mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn); return true; } -static bool is_access_allowed(u32 fault_err_code, u64 spte) +static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) { - if (fault_err_code & PFERR_FETCH_MASK) + if (fault->exec) return is_executable_pte(spte); - if (fault_err_code & PFERR_WRITE_MASK) + if (fault->write) return is_writable_pte(spte); /* Fault was on Read access */ @@ -3193,9 +3121,6 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { sptep = iterator.sptep; *spte = old_spte; - - if (!is_shadow_present_pte(old_spte)) - break; } return sptep; @@ -3204,7 +3129,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; @@ -3212,7 +3137,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) u64 *sptep = NULL; uint retry_count = 0; - if (!page_fault_can_be_fast(error_code)) + if (!page_fault_can_be_fast(fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3221,9 +3146,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) u64 new_spte; if (is_tdp_mmu(vcpu->arch.mmu)) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); else - sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); + sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); if (!is_shadow_present_pte(spte)) break; @@ -3242,7 +3167,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_access_allowed(error_code, spte)) { + if (is_access_allowed(fault, spte)) { ret = RET_PF_SPURIOUS; break; } @@ -3257,7 +3182,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * be removed in the fast path only if the SPTE was * write-protected for dirty-logging or access tracking. */ - if ((error_code & PFERR_WRITE_MASK) && + if (fault->write && spte_can_locklessly_be_made_writable(spte)) { new_spte |= PT_WRITABLE_MASK; @@ -3278,7 +3203,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || - !is_access_allowed(error_code, new_spte)) + !is_access_allowed(fault, new_spte)) break; /* @@ -3286,7 +3211,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. 
*/ - if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3299,7 +3224,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) } while (true); - trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); + trace_fast_page_fault(vcpu, fault, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3506,6 +3431,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (r) return r; + r = kvm_page_track_enable_mmu_write_tracking(vcpu->kvm); + if (r) + return r; + write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) @@ -3763,9 +3692,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf] = spte; - - if (!is_shadow_present_pte(spte)) - break; } return leaf; @@ -3856,20 +3782,19 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) } static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, - u32 error_code, gfn_t gfn) + struct kvm_page_fault *fault) { - if (unlikely(error_code & PFERR_RSVD_MASK)) + if (unlikely(fault->rsvd)) return false; - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) + if (!fault->present || !fault->write) return false; /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) return true; return false; @@ -3881,11 +3806,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) u64 spte; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) clear_sp_write_flooding_count(iterator.sptep); - if (!is_shadow_present_pte(spte)) - break; - } walk_shadow_page_lockless_end(vcpu); } @@ -3903,11 +3825,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable, int *r) +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + struct kvm_memory_slot *slot = fault->slot; bool async; /* @@ -3921,8 +3841,9 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!kvm_is_visible_memslot(slot)) { /* Don't expose private memslots to L2. 
*/ if (is_guest_mode(vcpu)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; + fault->slot = NULL; + fault->pfn = KVM_PFN_NOSLOT; + fault->map_writable = false; return false; } /* @@ -3939,46 +3860,45 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, - write, writable, hva); + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, + fault->write, &fault->map_writable, + &fault->hva); if (!async) return false; /* *pfn has correct page already */ - if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(cr2_or_gpa, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); + if (!fault->prefault && kvm_can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(fault->addr, fault->gfn); + if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { + trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); goto out_retry; - } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) + } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) goto out_retry; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, - write, writable, hva); + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, + fault->write, &fault->map_writable, + &fault->hva); out_retry: *r = RET_PF_RETRY; return true; } -static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault, int max_level, bool is_tdp) +static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - bool write = error_code & PFERR_WRITE_MASK; - bool map_writable; - gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; - kvm_pfn_t pfn; - hva_t hva; int r; - if (page_fault_handle_page_track(vcpu, error_code, gfn)) + fault->gfn = fault->addr >> PAGE_SHIFT; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + + if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; - r = fast_page_fault(vcpu, gpa, error_code); + r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; @@ -3989,11 +3909,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable, &r)) + if (kvm_faultin_pfn(vcpu, fault, &r)) return r; - if (handle_abnormal_pfn(vcpu, is_tdp ? 
0 : gpa, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r)) return r; r = RET_PF_RETRY; @@ -4003,36 +3922,34 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; if (is_tdp_mmu_fault) - r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, - pfn, prefault); + r = kvm_tdp_mmu_map(vcpu, fault); else - r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, - prefault, is_tdp); + r = __direct_map(vcpu, fault); out_unlock: if (is_tdp_mmu_fault) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(fault->pfn); return r; } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, - u32 error_code, bool prefault) +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { - pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ - return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, - PG_LEVEL_2M, false); + fault->max_level = PG_LEVEL_2M; + return direct_page_fault(vcpu, fault); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, @@ -4068,23 +3985,19 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault) +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - int max_level; - - for (max_level = KVM_MAX_HUGEPAGE_LEVEL; - max_level > PG_LEVEL_4K; - max_level--) { - int page_num = KVM_PAGES_PER_HPAGE(max_level); - gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); + while (fault->max_level > PG_LEVEL_4K) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; + + --fault->max_level; } - return direct_page_fault(vcpu, gpa, error_code, prefault, - max_level, true); + return direct_page_fault(vcpu, fault); } static void nonpaging_init_context(struct kvm_mmu *context) @@ -4205,7 +4118,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu) } static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - unsigned int access, int *nr_present) + unsigned int access) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -4213,7 +4126,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return true; } - (*nr_present)++; mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -5212,7 +5124,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush; + bool flush = false; /* * If we don't have indirect shadow pages, it means no page is @@ -5221,8 +5133,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - remote_flush = local_flush = false; - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); /* @@ 
-5251,18 +5161,17 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!spte) continue; - local_flush = true; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; if (need_remote_flush(entry, *spte)) - remote_flush = true; + flush = true; ++spte; } } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); write_unlock(&vcpu->kvm->mmu_lock); } @@ -5702,6 +5611,9 @@ void kvm_mmu_init_vm(struct kvm *kvm) */ kvm->arch.memslots_have_rmaps = true; + if (!tdp_enabled) + kvm->arch.memslots_mmu_write_tracking = true; + node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bf2bdbf333c2..585146a712d2 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -118,13 +118,8 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) kvm_x86_ops.cpu_dirty_log_size; } -extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + gfn_t gfn, bool can_unsync, bool speculative); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); @@ -155,19 +150,11 @@ enum { RET_PF_SPURIOUS, }; -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t pfn, int max_level); -int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level); -void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *goal_levelp); +void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); +void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 2924a4081a19..b8151bbca36a 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -252,9 +252,9 @@ TRACE_EVENT( TRACE_EVENT( fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, + TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, int ret), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), + TP_ARGS(vcpu, fault, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -268,8 +268,8 @@ TRACE_EVENT( TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->cr2_or_gpa = cr2_or_gpa; - __entry->error_code = error_code; + __entry->cr2_or_gpa = fault->addr; + __entry->error_code = fault->error_code; __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; @@ -367,8 +367,8 @@ TRACE_EVENT( TRACE_EVENT( kvm_mmu_spte_requested, - TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), - 
TP_ARGS(addr, level, pfn), + TP_PROTO(struct kvm_page_fault *fault), + TP_ARGS(fault), TP_STRUCT__entry( __field(u64, gfn) @@ -377,9 +377,9 @@ TRACE_EVENT( ), TP_fast_assign( - __entry->gfn = addr >> PAGE_SHIFT; - __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); - __entry->level = level; + __entry->gfn = fault->gfn; + __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1)); + __entry->level = fault->goal_level; ), TP_printk("gfn %llx pfn %llx level %d", diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 21427e84a82e..bb5d60bd4dbf 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -19,6 +19,16 @@ #include "mmu.h" #include "mmu_internal.h" +static bool write_tracking_enabled(struct kvm *kvm) +{ + /* + * Read memslots_mmu_write_tracking before gfn_track pointers. Pairs + * with smp_store_release in kvm_page_track_enable_mmu_write_tracking. + */ + return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || + smp_load_acquire(&kvm->arch.memslots_mmu_write_tracking); +} + void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { int i; @@ -29,12 +39,16 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) } } -int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages) { - int i; + int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { + if (i == KVM_PAGE_TRACK_WRITE && !write_tracking_enabled(kvm)) + continue; + slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL_ACCOUNT); @@ -57,6 +71,46 @@ static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) return true; } +int kvm_page_track_enable_mmu_write_tracking(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + unsigned short **gfn_track; + int i; + + if (write_tracking_enabled(kvm)) + return 0; + + mutex_lock(&kvm->slots_arch_lock); + + if (write_tracking_enabled(kvm)) { + mutex_unlock(&kvm->slots_arch_lock); + return 0; + } + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + gfn_track = slot->arch.gfn_track + KVM_PAGE_TRACK_WRITE; + *gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), + GFP_KERNEL_ACCOUNT); + if (*gfn_track == NULL) { + mutex_unlock(&kvm->slots_arch_lock); + return -ENOMEM; + } + } + } + + /* + * Ensure that memslots_mmu_write_tracking becomes true strictly + * after all the pointers are set. + */ + smp_store_release(&kvm->arch.memslots_mmu_write_tracking, true); + mutex_unlock(&kvm->slots_arch_lock); + + return 0; +} + static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode, short count) { @@ -92,6 +146,10 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (WARN_ON(!page_track_mode_is_valid(mode))) return; + if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && + !write_tracking_enabled(kvm))) + return; + update_gfn_track(slot, gfn, mode, 1); /* @@ -126,6 +184,10 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, if (WARN_ON(!page_track_mode_is_valid(mode))) return; + if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && + !write_tracking_enabled(kvm))) + return; + update_gfn_track(slot, gfn, mode, -1); /* @@ -139,19 +201,21 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. 
*/ -bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode) +bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) { - struct kvm_memory_slot *slot; int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot) return false; + if (mode == KVM_PAGE_TRACK_WRITE && !write_tracking_enabled(vcpu->kvm)) + return false; + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 913d52a7923e..d8889e02c4b7 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -561,6 +561,7 @@ static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte, bool no_dirty_log) { + struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; kvm_pfn_t pfn; @@ -573,30 +574,21 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); - if (is_error_pfn(pfn)) + if (!slot) return false; - /* - * we call mmu_set_spte() with host_writable = true because - * pte_prefetch_gfn_to_pfn always gets a writable pfn. - */ - mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, - true, true); + pfn = gfn_to_pfn_memslot_atomic(slot, gfn); + if (is_error_pfn(pfn)) + return false; + mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); kvm_release_pfn_clean(pfn); return true; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) -{ - pt_element_t gpte = *(const pt_element_t *)pte; - - FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); -} - static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { @@ -663,21 +655,16 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. 
*/ -static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, u32 error_code, - int max_level, kvm_pfn_t pfn, bool map_writable, - bool prefault) +static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + struct guest_walker *gw) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write_fault = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned int direct_access, access; - int top_level, level, req_level, ret; - gfn_t base_gfn = gw->gfn; + int top_level, ret; + gfn_t base_gfn = fault->gfn; + WARN_ON_ONCE(gw->gfn != base_gfn); direct_access = gw->pte_access; top_level = vcpu->arch.mmu->root_level; @@ -695,7 +682,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) goto out_gpte_changed; - for (shadow_walk_init(&it, vcpu, addr); + for (shadow_walk_init(&it, vcpu, fault->addr); shadow_walk_okay(&it) && it.level > gw->level; shadow_walk_next(&it)) { gfn_t table_gfn; @@ -707,7 +694,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, + sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, it.level-1, false, access); /* * We must synchronize the pagetable before linking it @@ -741,10 +728,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); @@ -753,12 +739,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. 
*/ - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, - &pfn, &level); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == level) + base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == fault->goal_level) break; validate_direct_spte(vcpu, it.sptep, direct_access); @@ -766,16 +751,20 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, drop_large_spte(vcpu, it.sptep); if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (huge_page_disallowed && req_level >= it.level) + if (fault->huge_page_disallowed && + fault->req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } - ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, base_gfn, pfn, prefault, map_writable); + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access, + base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -841,45 +830,40 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, - bool prefault) +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool write_fault = error_code & PFERR_WRITE_MASK; - bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; - kvm_pfn_t pfn; - hva_t hva; unsigned long mmu_seq; - bool map_writable, is_self_change_mapping; - int max_level; + bool is_self_change_mapping; - pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); + WARN_ON_ONCE(fault->is_tdp); /* + * Look up the guest pte for the faulting address. * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. */ - error_code &= ~PFERR_RSVD_MASK; - - /* - * Look up the guest pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); + r = FNAME(walk_addr)(&walker, vcpu, fault->addr, + fault->error_code & ~PFERR_RSVD_MASK); /* * The page is not mapped by the guest. Let the guest handle it. 
*/ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) + if (!fault->prefault) kvm_inject_emulated_page_fault(vcpu, &walker.fault); return RET_PF_RETRY; } - if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { - shadow_page_table_clear_flood(vcpu, addr); + fault->gfn = walker.gfn; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + + if (page_fault_handle_page_track(vcpu, fault)) { + shadow_page_table_clear_flood(vcpu, fault->addr); return RET_PF_EMULATE; } @@ -890,29 +874,28 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); if (is_self_change_mapping) - max_level = PG_LEVEL_4K; + fault->max_level = PG_LEVEL_4K; else - max_level = walker.level; + fault->max_level = walker.level; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable, &r)) + if (kvm_faultin_pfn(vcpu, fault, &r)) return r; - if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) + if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r)) return r; /* * Do not change pte_access if the pfn is a mmio page, otherwise * we will cache the incorrect access into mmio spte. */ - if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && - !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) { + if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && + !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; @@ -928,20 +911,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, - map_writable, prefault); + r = FNAME(fetch)(vcpu, fault, &walker); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(fault->pfn); return r; } @@ -1007,10 +989,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sizeof(pt_element_t))) break; - FNAME(update_pte)(vcpu, sp, sptep, &gpte); + FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); } - if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) + if (!sp->unsync_children) break; } write_unlock(&vcpu->kvm->mmu_lock); @@ -1066,14 +1048,19 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. 
+ * + * Returns + * < 0: the sp should be zapped + * 0: the sp is synced and no tlb flushing is required + * > 0: the sp is synced and tlb flushing is required */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; - int i, nr_present = 0; + int i; bool host_writable; gpa_t first_pte_gpa; - int set_spte_ret = 0; + bool flush = false; /* * Ignore various flags when verifying that it's safe to sync a shadow @@ -1098,11 +1085,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) */ if (WARN_ON_ONCE(sp->role.direct || (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word)) - return 0; + return -1; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + u64 *sptep, spte; + struct kvm_memory_slot *slot; unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; @@ -1115,10 +1104,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(pt_element_t))) - return 0; + return -1; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; + flush = true; continue; } @@ -1127,30 +1116,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, - &nr_present)) + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; + flush = true; continue; } - nr_present++; - - host_writable = sp->spt[i] & shadow_host_writable_mask; + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, false, + host_writable, &spte); - set_spte_ret |= set_spte(vcpu, &sp->spt[i], - pte_access, PG_LEVEL_4K, - gfn, spte_to_pfn(sp->spt[i]), - true, false, host_writable); + flush |= mmu_spte_update(sptep, spte); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) - kvm_flush_remote_tlbs(vcpu->kvm); - - return nr_present; + return flush; } #undef pt_element_t diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 3e97cdb13eb7..871f6114b0fa 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -89,15 +89,17 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } -int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte) +bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct kvm_memory_slot *slot, + unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, + u64 old_spte, bool speculative, bool can_unsync, + bool host_writable, u64 *new_spte) { + int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; - int ret = 0; + bool wrprot = false; - if (ad_disabled) + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; @@ -150,7 +152,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. 
* Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writable_pte(old_spte)) + if (is_writable_pte(old_spte)) goto out; /* @@ -159,10 +161,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. */ - if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) { + if (mmu_try_to_unsync_pages(vcpu, slot, gfn, can_unsync, speculative)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; + wrprot = true; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } @@ -171,16 +173,22 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, if (pte_access & ACC_WRITE_MASK) spte |= spte_shadow_dirty_mask(spte); +out: if (speculative) spte = mark_spte_for_access_track(spte); -out: WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON(level > PG_LEVEL_4K); + mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); + } + *new_spte = spte; - return ret; + return wrprot; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index eb7b227fc6cf..7c0b09461349 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -334,15 +334,11 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - -int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte); +bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct kvm_memory_slot *slot, + unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, + u64 old_spte, bool speculative, bool can_unsync, + bool host_writable, u64 *new_spte); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 64ccfc1fa553..953f24ded6bc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -167,6 +167,7 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, role.direct = true; role.gpte_is_8_bytes = true; role.access = ACC_ALL; + role.ad_disabled = !shadow_accessed_mask; return role; } @@ -489,8 +490,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, } /* - * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically - * and handle the associated bookkeeping, but do not mark the page dirty + * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically + * and handle the associated bookkeeping. Do not mark the page dirty * in KVM's dirty bitmaps. * * @kvm: kvm instance @@ -499,9 +500,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Returns: true if the SPTE was set, false if it was not. If false is returned, * this function will have no side-effects. 
*/ -static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { lockdep_assert_held_read(&kvm->mmu_lock); @@ -527,43 +528,6 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -/* - * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a - * TDP page fault. - * - * @vcpu: The vcpu instance that took the TDP page fault. - * @iter: a tdp_iter instance currently on the SPTE that should be set - * @new_spte: The value the SPTE should be set to - * - * Returns: true if the SPTE was set, false if it was not. If false is returned, - * this function will have no side-effects. - */ -static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, - struct tdp_iter *iter, - u64 new_spte) -{ - struct kvm *kvm = vcpu->kvm; - - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) - return false; - - /* - * Use kvm_vcpu_gfn_to_memslot() instead of going through - * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. - */ - if (is_writable_pte(new_spte)) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); - - if (slot && kvm_slot_dirty_track_enabled(slot)) { - /* Enforced by kvm_mmu_hugepage_adjust. */ - WARN_ON_ONCE(iter->level > PG_LEVEL_4K); - mark_page_dirty_in_slot(kvm, slot, iter->gfn); - } - } - - return true; -} - static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, struct tdp_iter *iter) { @@ -573,7 +537,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -929,26 +893,26 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) * Installs a last-level SPTE to handle a TDP page fault. * (NPT/EPT violation/misconfiguration) */ -static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, - int map_writable, - struct tdp_iter *iter, - kvm_pfn_t pfn, bool prefault) +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + struct tdp_iter *iter) { + struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep); u64 new_spte; int ret = RET_PF_FIXED; - int make_spte_ret = 0; + bool wrprot = false; - if (unlikely(is_noslot_pfn(pfn))) + WARN_ON(sp->role.level != fault->goal_level); + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else - make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, - pfn, iter->old_spte, prefault, true, - map_writable, !shadow_accessed_mask, - &new_spte); + wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + fault->pfn, iter->old_spte, fault->prefault, true, + fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) + else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; /* @@ -956,10 +920,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. 
*/ - if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { - if (write) + if (wrprot) { + if (fault->write) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ @@ -986,37 +949,26 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ -int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault) +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu *mmu = vcpu->arch.mmu; struct tdp_iter iter; struct kvm_mmu_page *sp; u64 *child_pt; u64 new_spte; int ret; - gfn_t gfn = gpa >> PAGE_SHIFT; - int level; - int req_level; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(gpa, level, pfn); + trace_kvm_mmu_spte_requested(fault); rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(iter.old_spte, gfn, - iter.level, &pfn, &level); + tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); - if (iter.level == level) + if (iter.level == fault->goal_level) break; /* @@ -1052,10 +1004,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { + if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) { tdp_mmu_link_page(vcpu->kvm, sp, - huge_page_disallowed && - req_level >= iter.level); + fault->huge_page_disallowed && + fault->req_level >= iter.level); trace_kvm_mmu_get_page(sp, true); } else { @@ -1065,13 +1017,12 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } } - if (iter.level != level) { + if (iter.level != fault->goal_level) { rcu_read_unlock(); return RET_PF_RETRY; } - ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, - pfn, prefault); + ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); rcu_read_unlock(); return ret; @@ -1241,8 +1192,7 @@ retry: new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, - new_spte)) { + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { /* * The iter must explicitly re-read the SPTE because * the atomic cmpxchg failed. @@ -1310,8 +1260,7 @@ retry: continue; } - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, - new_spte)) { + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { /* * The iter must explicitly re-read the SPTE because * the atomic cmpxchg failed. 
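[Editorial note, not part of the series] The paging_tmpl.h, spte.c and tdp_mmu.c hunks above thread a single struct kvm_page_fault through FNAME(page_fault), FNAME(fetch), kvm_tdp_mmu_map() and make_spte() in place of the old addr/error_code/max_level/pfn/map_writable/prefault argument lists. The structure itself is defined in arch/x86/kvm/mmu/mmu_internal.h and is not shown in this diff; the sketch below is a non-authoritative reconstruction assembled only from the fields the call sites above dereference, so the types, grouping and qualifiers are assumptions rather than the real definition:

/* Illustrative sketch only -- reconstructed from the call sites above. */
struct kvm_page_fault {
	/* Inputs fixed at fault time. */
	const gpa_t addr;		/* faulting GPA (or GVA for shadow paging) */
	const u32 error_code;
	const bool prefault;		/* async page fault style prefault */

	/* Decoded from error_code / MMU mode. */
	const bool write;
	const bool user;
	const bool is_tdp;
	const bool nx_huge_page_workaround_enabled;

	/*
	 * Hugepage sizing, updated by kvm_mmu_hugepage_adjust() and
	 * disallowed_hugepage_adjust().
	 */
	u8 max_level;
	u8 req_level;
	u8 goal_level;
	bool huge_page_disallowed;

	/* Outputs of the gfn/pfn lookup (kvm_faultin_pfn() and friends). */
	gfn_t gfn;
	struct kvm_memory_slot *slot;	/* NULL for MMIO / no-slot faults */
	kvm_pfn_t pfn;
	hva_t hva;
	bool map_writable;
};

Carrying one pointer instead of half a dozen scalars is what lets the series drop the write/map_writable/prefault plumbing from kvm_tdp_mmu_map() and mmu_set_spte(), and lets make_spte() reach the memslot directly for dirty tracking instead of going through the now-removed tdp_mmu_map_set_spte_atomic().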
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 358f447d4012..ceaf7ff3ca7c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -48,9 +48,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); -int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault); +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 510b833cbd39..f8b7bc04b3e7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -238,6 +238,18 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); } +static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) +{ + /* Nested FLUSHBYASID is not supported yet. */ + switch(tlb_ctl) { + case TLB_CONTROL_DO_NOTHING: + case TLB_CONTROL_FLUSH_ALL_ASID: + return true; + default: + return false; + } +} + static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, struct vmcb_control_area *control) { @@ -257,6 +269,9 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, IOPM_SIZE))) return false; + if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl))) + return false; + return true; } @@ -538,8 +553,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(vcpu); - svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset = - vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + svm->nested.ctl.tsc_offset, + svm->tsc_ratio_msr); + + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; + + if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + WARN_ON(!svm->tsc_scaling_enabled); + nested_svm_update_tsc_ratio_msr(vcpu); + } svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -550,9 +574,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; - svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count; - svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh; - nested_svm_transition_tlb_flush(vcpu); /* Enter Guest-Mode */ @@ -810,11 +831,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - vmcb12->control.pause_filter_count = - svm->vmcb->control.pause_filter_count; - vmcb12->control.pause_filter_thresh = - svm->vmcb->control.pause_filter_thresh; - nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); @@ -832,6 +848,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } + if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + WARN_ON(!svm->tsc_scaling_enabled); + vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + } + svm->nested.ctl.nested_cr3 = 0; /* @@ -1219,6 +1241,16 @@ int nested_svm_exit_special(struct vcpu_svm 
*svm) return NESTED_EXIT_CONTINUE; } +void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->arch.tsc_scaling_ratio = + kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, + svm->tsc_ratio_msr); + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); +} + static int svm_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c36b5fe4c27c..1e8b26b93b4f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2631,11 +2631,11 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -void sev_es_create_vcpu(struct vcpu_svm *svm) +void sev_es_vcpu_reset(struct vcpu_svm *svm) { /* - * Set the GHCB MSR value as per the GHCB specification when creating - * a vCPU for an SEV-ES guest. + * Set the GHCB MSR value as per the GHCB specification when emulating + * vCPU RESET for an SEV-ES guest. */ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, GHCB_VERSION_MIN, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 989685098b3e..89077160d463 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -186,6 +186,13 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* enable/disable LBR virtualization */ +static int lbrv = true; +module_param(lbrv, int, 0444); + +static int tsc_scaling = true; +module_param(tsc_scaling, int, 0444); + /* * enable / disable AVIC. Because the defaults differ for APICv * support between VMX and SVM we cannot use module_param_named. @@ -466,7 +473,7 @@ static int has_svm(void) static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) + if (tsc_scaling) wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); cpu_svm_disable(); @@ -509,6 +516,10 @@ static int svm_hardware_enable(void) wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + /* + * Set the default value, even if we don't use TSC scaling + * to avoid having stale value in the msr + */ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); } @@ -929,6 +940,9 @@ static __init void svm_set_cpu_caps(void) if (npt_enabled) kvm_cpu_cap_set(X86_FEATURE_NPT); + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } @@ -976,10 +990,15 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); - if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; + } } tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); @@ -1059,6 +1078,13 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + svm_set_cpu_caps(); /* @@ -1109,7 +1135,9 @@ static u64 
svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { - return kvm_default_tsc_scaling_ratio; + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->tsc_ratio_msr; } static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -1121,7 +1149,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); } @@ -1150,6 +1178,38 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, } } +static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (guest_cpuid_is_intel(vcpu)) { + /* + * We must intercept SYSENTER_EIP and SYSENTER_ESP + * accesses because the processor only stores 32 bits. + * For the same reason we cannot use virtual VMLOAD/VMSAVE. + */ + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); + } else { + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + svm_clr_intercept(svm, INTERCEPT_VMLOAD); + svm_clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + /* No need to intercept these MSRs */ + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); + } +} + static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1296,11 +1356,25 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } svm_hv_init_vmcb(svm->vmcb); + init_vmcb_after_set_cpuid(vcpu); vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); +} + +static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + + svm_init_osvw(vcpu); + vcpu->arch.microcode_version = 0x01000065; + svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + if (sev_es_guest(vcpu->kvm)) + sev_es_vcpu_reset(svm); } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1311,6 +1385,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->virt_spec_ctrl = 0; init_vmcb(vcpu); + + if (!init_event) + __svm_vcpu_reset(vcpu); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -1370,24 +1447,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); + svm_switch_vmcb(svm, &svm->vmcb01); if (vmsa_page) svm->vmsa = page_address(vmsa_page); svm->guest_state_loaded = false; - svm_switch_vmcb(svm, &svm->vmcb01); - init_vmcb(vcpu); - - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - - svm_init_osvw(vcpu); - vcpu->arch.microcode_version = 0x01000065; - - if (sev_es_guest(vcpu->kvm)) - /* Perform SEV-ES specific VMCB creation updates */ - sev_es_create_vcpu(svm); - return 0; error_free_vmsa_page: @@ -1447,7 +1513,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) vmsave(__sme_page_pa(sd->save_area)); } - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) 
{ + if (tsc_scaling) { u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { __this_cpu_write(current_tsc_ratio, tsc_ratio); @@ -2657,6 +2723,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct vcpu_svm *svm = to_svm(vcpu); switch (msr_info->index) { + case MSR_AMD64_TSC_RATIO: + if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) + return 1; + msr_info->data = svm->tsc_ratio_msr; + break; case MSR_STAR: msr_info->data = svm->vmcb01.ptr->save.star; break; @@ -2806,6 +2877,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; switch (ecx) { + case MSR_AMD64_TSC_RATIO: + if (!msr->host_initiated && !svm->tsc_scaling_enabled) + return 1; + + if (data & TSC_RATIO_RSVD) + return 1; + + svm->tsc_ratio_msr = data; + + if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) + nested_svm_update_tsc_ratio_msr(vcpu); + + break; case MSR_IA32_CR_PAT: if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) return 1; @@ -2918,7 +3002,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_aux = data; break; case MSR_IA32_DEBUGCTLMSR: - if (!boot_cpu_has(X86_FEATURE_LBRV)) { + if (!lbrv) { vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", __func__, data); break; @@ -4001,6 +4085,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); + svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); + svm_recalc_instruction_intercepts(vcpu, svm); /* For sev guests, the memory encryption bit is not reserved in CR3. */ @@ -4027,33 +4113,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_NESTED); } - - if (guest_cpuid_is_intel(vcpu)) { - /* - * We must intercept SYSENTER_EIP and SYSENTER_ESP - * accesses because the processor only stores 32 bits. - * For the same reason we cannot use virtual VMLOAD/VMSAVE. - */ - svm_set_intercept(svm, INTERCEPT_VMLOAD); - svm_set_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); - } else { - /* - * If hardware supports Virtual VMLOAD VMSAVE then enable it - * in VMCB and clear intercepts to avoid #VMEXIT. 
- */ - if (vls) { - svm_clr_intercept(svm, INTERCEPT_VMLOAD); - svm_clr_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - } - /* No need to intercept these MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); - } + init_vmcb_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 128a54b1fbf1..0d7bbe548ac3 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -140,6 +140,8 @@ struct vcpu_svm { u64 next_rip; u64 spec_ctrl; + + u64 tsc_ratio_msr; /* * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be * translated into the appropriate L2_CFG bits on the host to @@ -160,7 +162,8 @@ struct vcpu_svm { unsigned long int3_rip; /* cached guest cpuid flags for faster access */ - bool nrips_enabled : 1; + bool nrips_enabled : 1; + bool tsc_scaling_enabled : 1; u32 ldr_reg; u32 dfr_reg; @@ -483,6 +486,8 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); +void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); void nested_load_control_from_vmcb12(struct vcpu_svm *svm, struct vmcb_control_area *control); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); @@ -562,7 +567,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); -void sev_es_create_vcpu(struct vcpu_svm *svm); +void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); void sev_es_unmap_ghcb(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index eedcebf58004..af1bbb73430a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -191,7 +191,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS. 
*/ - if (vmx->nested.current_vmptr == -1ull && + if (vmx->nested.current_vmptr == INVALID_GPA && !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) return nested_vmx_failInvalid(vcpu); @@ -218,7 +218,7 @@ static inline u64 vmx_control_msr(u32 low, u32 high) static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); vmx->nested.need_vmcs12_to_shadow_sync = false; } @@ -290,9 +290,10 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; + vmx->nested.vmxon_ptr = INVALID_GPA; free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; if (enable_shadow_vmcs) { vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); @@ -709,7 +710,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *shadow; if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) + vmcs12->vmcs_link_pointer == INVALID_GPA) return; shadow = get_shadow_vmcs12(vcpu); @@ -727,7 +728,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) + vmcs12->vmcs_link_pointer == INVALID_GPA) return; kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, @@ -1994,7 +1995,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( } if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; nested_release_evmcs(vcpu); @@ -2178,7 +2179,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) } if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); /* * Set the MSR load/store lists to match L0's settings. Only the @@ -2197,7 +2198,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, { prepare_vmcs02_constant_state(vmx); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -2949,7 +2950,7 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *shadow; struct kvm_host_map map; - if (vmcs12->vmcs_link_pointer == -1ull) + if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) @@ -3216,7 +3217,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to * force VM-Entry to fail. 
*/ - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); } } @@ -3527,7 +3528,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && - vmx->nested.current_vmptr == -1ull)) + vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -4975,7 +4976,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.current_vmptr == -1ull) + if (vmx->nested.current_vmptr == INVALID_GPA) return; copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); @@ -4995,7 +4996,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; } /* Emulate the VMXOFF instruction */ @@ -5090,12 +5091,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* - * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. */ - if (vmx->nested.current_vmptr == -1ull || + if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); /* Decode instruction info and find the field to read */ @@ -5182,12 +5183,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; /* - * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMWRITE sets the ALU flags for VMfailInvalid. 
*/ - if (vmx->nested.current_vmptr == -1ull || + if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); if (instr_info & BIT(10)) @@ -5630,7 +5631,7 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, gpa_t bitmap, last_bitmap; u8 b; - last_bitmap = (gpa_t)-1; + last_bitmap = INVALID_GPA; b = -1; while (size > 0) { @@ -6106,8 +6107,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), .hdr.vmx.flags = 0, - .hdr.vmx.vmxon_pa = -1ull, - .hdr.vmx.vmcs12_pa = -1ull, + .hdr.vmx.vmxon_pa = INVALID_GPA, + .hdr.vmx.vmcs12_pa = INVALID_GPA, .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = @@ -6133,7 +6134,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) + vmcs12->vmcs_link_pointer != INVALID_GPA) kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); } @@ -6209,7 +6210,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, return -EFAULT; if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + vmcs12->vmcs_link_pointer != INVALID_GPA) { if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; @@ -6244,11 +6245,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) return -EINVAL; - if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { if (kvm_state->hdr.vmx.smm.flags) return -EINVAL; - if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) return -EINVAL; /* @@ -6302,7 +6303,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx_leave_nested(vcpu); - if (kvm_state->hdr.vmx.vmxon_pa == -1ull) + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) return 0; vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; @@ -6315,13 +6316,13 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, /* See vmx_has_valid_vmcs12. */ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || - (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) + (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) return -EINVAL; else return 0; } - if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) return -EINVAL; @@ -6366,7 +6367,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + vmcs12->vmcs_link_pointer != INVALID_GPA) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); if (kvm_state->size < diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 116b08904ac3..1c8b2b6e7ed9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4328,10 +4328,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) #define VMX_XSS_EXIT_BITMAP 0 -/* - * Noting that the initialization of Guest-state Area of VMCS is in - * vmx_vcpu_reset(). 
- */ static void init_vmcs(struct vcpu_vmx *vmx) { if (nested) @@ -4340,7 +4336,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); @@ -4436,10 +4432,40 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx_setup_uret_msrs(vmx); } +static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + init_vmcs(vmx); + + if (nested) + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); + + vcpu_setup_sgx_lepubkeyhash(vcpu); + + vmx->nested.posted_intr_nv = -1; + vmx->nested.vmxon_ptr = INVALID_GPA; + vmx->nested.current_vmptr = INVALID_GPA; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; + + vcpu->arch.microcode_version = 0x100000000ULL; + vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; + + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; +} + static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!init_event) + __vmx_vcpu_reset(vcpu); + vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; @@ -4449,6 +4475,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_cr8(vcpu, 0); vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); @@ -6408,6 +6435,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: + case MSR_AMD64_TSC_RATIO: /* This is AMD only. */ return false; default: @@ -6815,7 +6843,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; - int i, cpu, err; + int i, err; BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); @@ -6836,10 +6864,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - for (i = 0; i < kvm_nr_uret_msrs; ++i) { - vmx->guest_uret_msrs[i].data = 0; + for (i = 0; i < kvm_nr_uret_msrs; ++i) vmx->guest_uret_msrs[i].mask = -1ull; - } if (boot_cpu_has(X86_FEATURE_RTM)) { /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. @@ -6876,12 +6902,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) } vmx->loaded_vmcs = &vmx->vmcs01; - cpu = get_cpu(); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - init_vmcs(vmx); - vmx_vcpu_put(vcpu); - put_cpu(); + if (cpu_need_virtualize_apic_accesses(vcpu)) { err = alloc_apic_access_page(vcpu->kvm); if (err) @@ -6894,27 +6915,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vmcs; } - if (nested) - memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); - else - memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); - - vcpu_setup_sgx_lepubkeyhash(vcpu); - - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; - - vcpu->arch.microcode_version = 0x100000000ULL; - vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; - - /* - * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR - * or POSTED_INTR_WAKEUP_VECTOR. 
- */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; - return 0; free_vmcs: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aabd3a2ec1bc..db7fa1398f0d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -790,30 +790,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_GPL(kvm_require_dr); -/* - * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_vcpu_read_guest_page is that this function - * can read from guest physical or from the guest's guest physical memory. - */ -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t ngfn, void *data, int offset, int len, - u32 access) -{ - struct x86_exception exception; - gfn_t real_gfn; - gpa_t ngpa; - - ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); - if (real_gfn == UNMAPPED_GVA) - return -EFAULT; - - real_gfn = gpa_to_gfn(real_gfn); - - return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); - static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); @@ -825,34 +801,38 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + gpa_t real_gpa; int i; int ret; u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (ret < 0) { - ret = 0; - goto out; - } + /* + * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated + * to an L1 GPA. + */ + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(pdpt_gfn), + PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); + if (real_gpa == UNMAPPED_GVA) + return 0; + + /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, + cr3 & GENMASK(11, 5), sizeof(pdpte)); + if (ret < 0) + return 0; + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & pdptr_rsvd_bits(vcpu))) { - ret = 0; - goto out; + return 0; } } - ret = 1; memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); vcpu->arch.pdptrs_from_userspace = false; -out: - - return ret; + return 1; } EXPORT_SYMBOL_GPL(load_pdptrs); @@ -993,7 +973,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* * Do not allow the guest to set bits that we do not support * saving. However, xcr0 bit 0 is always set, even if the - * emulated CPU does not support XSAVE (see fx_init). + * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). 
*/ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) @@ -1381,6 +1361,7 @@ static const u32 emulated_msrs_all[] = { MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, MSR_IA32_POWER_CTL, MSR_IA32_UCODE_REV, @@ -2762,50 +2743,58 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } -void kvm_make_mclock_inprogress_request(struct kvm *kvm) +static void kvm_make_mclock_inprogress_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } -static void kvm_gen_update_masterclock(struct kvm *kvm) +static void kvm_start_pvclock_update(struct kvm *kvm) { -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; - unsigned long flags; - - kvm_hv_invalidate_tsc_page(kvm); kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + spin_lock_irq(&ka->pvclock_gtod_sync_lock); +} + +static void kvm_end_pvclock_update(struct kvm *kvm) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_vcpu *vcpu; + int i; + spin_unlock_irq(&ka->pvclock_gtod_sync_lock); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); -#endif } -u64 get_kvmclock_ns(struct kvm *kvm) +static void kvm_update_masterclock(struct kvm *kvm) +{ + kvm_hv_invalidate_tsc_page(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + kvm_end_pvclock_update(kvm); +} + +static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; unsigned long flags; - u64 ret; spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - return get_kvmclock_base_ns() + ka->kvmclock_offset; + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; + return; } + data->flags |= KVM_CLOCK_TSC_STABLE; hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); @@ -2817,13 +2806,26 @@ u64 get_kvmclock_ns(struct kvm *kvm) kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); - } else - ret = get_kvmclock_base_ns() + ka->kvmclock_offset; + data->clock = __pvclock_read_cycles(&hv_clock, rdtsc()); + } else { + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; + } put_cpu(); +} - return ret; +u64 get_kvmclock_ns(struct kvm *kvm) +{ + struct kvm_clock_data data; + + /* + * Zero flags as it's accessed RMW, leave everything else uninitialized + * as clock is always written and no other fields are consumed. 
+ */ + data.flags = 0; + + get_kvmclock(kvm, &data); + return data.clock; } static void kvm_setup_pvclock_page(struct kvm_vcpu *v, @@ -4077,7 +4079,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -5829,6 +5831,50 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ +static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_clock_data data; + + memset(&data, 0, sizeof(data)); + get_kvmclock(kvm, &data); + if (copy_to_user(argp, &data, sizeof(data))) + return -EFAULT; + + return 0; +} + +static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_clock_data data; + u64 now_ns; + + if (copy_from_user(&data, argp, sizeof(data))) + return -EFAULT; + + if (data.flags) + return -EINVAL; + + kvm_hv_invalidate_tsc_page(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'data.clock' is very small. + */ + if (kvm->arch.use_master_clock) + now_ns = ka->master_kernel_ns; + else + now_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = data.clock - now_ns; + kvm_end_pvclock_update(kvm); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -6072,60 +6118,12 @@ set_pit2_out: break; } #endif - case KVM_SET_CLOCK: { - struct kvm_arch *ka = &kvm->arch; - struct kvm_clock_data user_ns; - u64 now_ns; - - r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) - goto out; - - r = -EINVAL; - if (user_ns.flags) - goto out; - - r = 0; - /* - * TODO: userspace has to take care of races with VCPU_RUN, so - * kvm_gen_update_masterclock() can be cut down to locked - * pvclock_update_vm_gtod_copy(). - */ - kvm_gen_update_masterclock(kvm); - - /* - * This pairs with kvm_guest_time_update(): when masterclock is - * in use, we use master_kernel_ns + kvmclock_offset to set - * unsigned 'system_time' so if we use get_kvmclock_ns() (which - * is slightly ahead) here we risk going negative on unsigned - * 'system_time' when 'user_ns.clock' is very small. - */ - spin_lock_irq(&ka->pvclock_gtod_sync_lock); - if (kvm->arch.use_master_clock) - now_ns = ka->master_kernel_ns; - else - now_ns = get_kvmclock_base_ns(); - ka->kvmclock_offset = user_ns.clock - now_ns; - spin_unlock_irq(&ka->pvclock_gtod_sync_lock); - - kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); + case KVM_SET_CLOCK: + r = kvm_vm_ioctl_set_clock(kvm, argp); break; - } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - now_ns = get_kvmclock_ns(kvm); - user_ns.clock = now_ns; - user_ns.flags = kvm->arch.use_master_clock ? 
KVM_CLOCK_TSC_STABLE : 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); - - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) - goto out; - r = 0; + case KVM_GET_CLOCK: + r = kvm_vm_ioctl_get_clock(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops.mem_enc_op) @@ -8121,14 +8119,13 @@ static void tsc_khz_changed(void *data) static void kvm_hyperv_tsc_notifier(void) { struct kvm *kvm; - struct kvm_vcpu *vcpu; int cpu; - unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ @@ -8139,16 +8136,11 @@ static void kvm_hyperv_tsc_notifier(void) list_for_each_entry(kvm, &vm_list, vm_list) { struct kvm_arch *ka = &kvm->arch; - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + spin_lock_irq(&ka->pvclock_gtod_sync_lock); pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_end_pvclock_update(kvm); } + mutex_unlock(&kvm_lock); } #endif @@ -9242,14 +9234,7 @@ static void process_smi(struct kvm_vcpu *vcpu) void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { - cpumask_var_t cpus; - - zalloc_cpumask_var(&cpus, GFP_ATOMIC); - - kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - NULL, vcpu_bitmap, cpus); - - free_cpumask_var(cpus); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap); } void kvm_make_scan_ioapic_request(struct kvm *kvm) @@ -9432,7 +9417,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) - kvm_gen_update_masterclock(vcpu->kvm); + kvm_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { @@ -10624,24 +10609,6 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } -static void fx_init(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.guest_fpu) - return; - - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Ensure guest xcr0 is valid for loading - */ - vcpu->arch.xcr0 = XFEATURE_MASK_FP; - - vcpu->arch.cr0 |= X86_CR0_ET; -} - void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_fpu) { @@ -10720,7 +10687,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) pr_err("kvm: failed to allocate vcpu's fpu\n"); goto free_user_fpu; } - fx_init(vcpu); + fpstate_init(&vcpu->arch.guest_fpu->state); + if (boot_cpu_has(X86_FEATURE_XSAVES)) + vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = + host_xcr0 | XSTATE_COMPACTION_ENABLED; vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -10821,9 +10791,19 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + struct kvm_cpuid_entry2 *cpuid_0x1; unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long new_cr0; - u32 eax, dummy; + + /* + * Several of the "set" flows, e.g. 
+     * to handle side effects.  RESET emulation hits those flows and relies
+     * on emulated/virtualized registers, including those that are loaded
+     * into hardware, to be zeroed at vCPU creation.  Use CRs as a sentinel
+     * to detect improper or missing initialization.
+     */
+    WARN_ON_ONCE(!init_event &&
+             (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
 
     kvm_lapic_reset(vcpu, init_event);
 
@@ -10886,21 +10866,19 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
         vcpu->arch.xcr0 = XFEATURE_MASK_FP;
     }
 
+    /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
     memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
-    vcpu->arch.regs_avail = ~0;
-    vcpu->arch.regs_dirty = ~0;
+    kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
 
     /*
      * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
      * if no CPUID match is found.  Note, it's impossible to get a match at
      * RESET since KVM emulates RESET before exposing the vCPU to userspace,
-     * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET.
-     * But, go through the motions in case that's ever remedied.
+     * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
+     * on RESET.  But, go through the motions in case that's ever remedied.
      */
-    eax = 1;
-    if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
-        eax = 0x600;
-    kvm_rdx_write(vcpu, eax);
+    cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
+    kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
 
     vcpu->arch.ia32_xss = 0;
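For readers unfamiliar with the 0x600 fallback written to RDX above: CPUID leaf 0x1 EAX packs stepping, model, and family into its low 12 bits (with extended model/family above them), so 0x600 decodes to family 6, model 0, stepping 0, i.e. a generic P6/Athlon-class part. A quick stand-alone illustration of that decoding (standard CPUID bit layout, not KVM code):

/* sketch: decode the family/model/stepping word that RESET puts in RDX */
#include <stdio.h>

static void decode_fms(unsigned int eax)
{
	unsigned int stepping = eax & 0xf;
	unsigned int model    = (eax >> 4) & 0xf;
	unsigned int family   = (eax >> 8) & 0xf;

	/* Extended model/family bits only apply to family 0x6/0xf parts. */
	if (family == 0x6 || family == 0xf)
		model |= ((eax >> 16) & 0xf) << 4;
	if (family == 0xf)
		family += (eax >> 20) & 0xff;

	printf("eax=%#x -> family %u, model %u, stepping %u\n",
	       eax, family, model, stepping);
}

int main(void)
{
	decode_fms(0x600);	/* KVM's fallback: family 6, model 0, stepping 0 */
	return 0;
}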
 
@@ -11152,7 +11130,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 void kvm_arch_free_vm(struct kvm *kvm)
 {
     kfree(to_kvm_hv(kvm)->hv_pa_pg);
-    vfree(kvm);
+    __kvm_arch_free_vm(kvm);
 }
 
@@ -11498,7 +11476,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
         }
     }
 
-    if (kvm_page_track_create_memslot(slot, npages))
+    if (kvm_page_track_create_memslot(kvm, slot, npages))
         goto out_free;
 
     return 0;
@@ -12096,6 +12074,15 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
     return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
 }
 
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
+                  struct kvm_kernel_irq_routing_entry *new)
+{
+    if (new->type != KVM_IRQ_ROUTING_MSI)
+        return true;
+
+    return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
+}
+
 bool kvm_vector_hashing_enabled(void)
 {
     return vector_hashing;
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index f960f5d7664e..107762427648 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -126,6 +126,7 @@ config DRM_I915_GVT_KVMGT
     depends on DRM_I915_GVT
     depends on KVM
     depends on VFIO_MDEV
+    select KVM_EXTERNAL_WRITE_TRACKING
     default n
     help
       Choose this option if you want to enable KVMGT support for
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0f18df7fe874..60a35d9fe259 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -39,8 +39,8 @@
 #include <asm/kvm_host.h>
 #include <linux/kvm_dirty_ring.h>
 
-#ifndef KVM_MAX_VCPU_ID
-#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
+#ifndef KVM_MAX_VCPU_IDS
+#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
 #endif
 
 /*
@@ -160,8 +160,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_ARCH_REQ(nr)           KVM_ARCH_REQ_FLAGS(nr, 0)
 
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
-                 struct kvm_vcpu *except,
-                 unsigned long *vcpu_bitmap, cpumask_var_t tmp);
+                 unsigned long *vcpu_bitmap);
 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
                       struct kvm_vcpu *except);
@@ -1082,10 +1081,17 @@ static inline struct kvm *kvm_arch_alloc_vm(void)
 {
     return kzalloc(sizeof(struct kvm), GFP_KERNEL);
 }
+#endif
+
+static inline void __kvm_arch_free_vm(struct kvm *kvm)
+{
+    kvfree(kvm);
+}
 
+#ifndef __KVM_HAVE_ARCH_VM_FREE
 static inline void kvm_arch_free_vm(struct kvm *kvm)
 {
-    kfree(kvm);
+    __kvm_arch_free_vm(kvm);
 }
 #endif
 
@@ -1765,6 +1771,8 @@ void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
                   uint32_t guest_irq, bool set);
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
+                  struct kvm_kernel_irq_routing_entry *);
 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
 
 #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS
diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
index 0299cd81b8ba..f968dfd4ee88 100644
--- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
+++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
@@ -53,7 +53,7 @@ int main(int argc, char *argv[])
         kvm_max_vcpu_id = kvm_max_vcpus;
 
     TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus,
-            "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).",
+            "KVM_MAX_VCPU_IDS (%d) must be at least as large as KVM_MAX_VCPUS (%d).",
             kvm_max_vcpu_id, kvm_max_vcpus);
 
     test_vcpu_creation(0, kvm_max_vcpus);
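Note that the selftest change above is purely a message-string update: the UAPI capability names are untouched, so userspace still queries KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID; only the kernel-internal limit macro is now spelled KVM_MAX_VCPU_IDS. A hedged sketch of the query the selftest performs (stand-alone program, assumes a reasonably recent <linux/kvm.h>):

/* sketch: query vCPU count and vCPU-ID space limits via KVM_CHECK_EXTENSION */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int max_vcpus, max_vcpu_id;

	if (kvm < 0)
		return 1;

	max_vcpus = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
	max_vcpu_id = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPU_ID);

	/* An unreported capability returns 0; the selftest then assumes the
	 * ID space is at least as large as the vCPU count. */
	if (max_vcpu_id == 0)
		max_vcpu_id = max_vcpus;

	printf("max vcpus %d, vcpu id space %d\n", max_vcpus, max_vcpu_id);
	return 0;
}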
diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c
index a0d0c83d83de..50e0cf41a7dd 100644
--- a/tools/testing/selftests/kvm/lib/sparsebit.c
+++ b/tools/testing/selftests/kvm/lib/sparsebit.c
@@ -1866,7 +1866,7 @@ void sparsebit_validate_internal(struct sparsebit *s)
      * of total bits set.
      */
     if (s->num_set != total_bits_set) {
-        fprintf(stderr, "Number of bits set missmatch,\n"
+        fprintf(stderr, "Number of bits set mismatch,\n"
             "  s->num_set: 0x%lx total_bits_set: 0x%lx",
             s->num_set, total_bits_set);
 
diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
index f40fd097cb35..6f6fd189dda3 100644
--- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
@@ -109,8 +109,7 @@ int main(int argc, char *argv[])
         }
     }
 
-    kvm_vm_free(vm);
-done:
+    kvm_vm_free(vm);
     return 0;
 }
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
index 7e33a350b053..e683d0ac3e45 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
@@ -161,7 +161,7 @@ int main(int argc, char *argv[])
         }
     }
 
-    kvm_vm_free(vm);
 done:
+    kvm_vm_free(vm);
     return 0;
 }
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index e996989cd580..2ad013b8bde9 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -281,6 +281,13 @@ int __attribute__((weak)) kvm_arch_update_irqfd_routing(
 {
     return 0;
 }
+
+bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
+                struct kvm_kernel_irq_routing_entry *old,
+                struct kvm_kernel_irq_routing_entry *new)
+{
+    return true;
+}
 #endif
 
 static int
@@ -615,10 +622,16 @@ void kvm_irq_routing_update(struct kvm *kvm)
     spin_lock_irq(&kvm->irqfds.lock);
 
     list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+        /* Under irqfds.lock, so can read irq_entry safely */
+        struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
+#endif
+
         irqfd_update(kvm, irqfd);
 
 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
-        if (irqfd->producer) {
+        if (irqfd->producer &&
+            kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
             int ret = kvm_arch_update_irqfd_routing(
                     irqfd->kvm, irqfd->producer->irq,
                     irqfd->gsi, 1);
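The eventfd.c change above snapshots the old routing entry under irqfds.lock and skips the potentially expensive posted-interrupt IRTE update whenever the arch hook reports the route as unchanged; the x86 hook earlier in this series simply memcmp()s the MSI payload. A stand-alone sketch of that comparison idea, using a hypothetical msi_msg stand-in rather than the kernel's kvm_kernel_irq_routing_entry:

/* sketch: "did this MSI route actually change?" distilled into user code */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct msi_msg {
	unsigned int address_lo;	/* target APIC / delivery mode bits */
	unsigned int address_hi;
	unsigned int data;		/* vector and trigger information */
};

static bool msi_route_changed(const struct msi_msg *old,
			      const struct msi_msg *new)
{
	/* Bit-identical messages need no interrupt-remapping update. */
	return memcmp(old, new, sizeof(*new)) != 0;
}

int main(void)
{
	struct msi_msg a = { 0xfee00000, 0, 0x4041 };
	struct msi_msg b = a;

	printf("unchanged -> %d\n", msi_route_changed(&a, &b));	/* 0 */
	b.data = 0x4042;						/* new vector */
	printf("changed   -> %d\n", msi_route_changed(&a, &b));	/* 1 */
	return 0;
}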
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7851f3a1b5f7..3f6d450355f0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -155,6 +155,8 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 static unsigned long long kvm_createvm_count;
 static unsigned long long kvm_active_vms;
 
+static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
+
 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                            unsigned long start, unsigned long end)
 {
@@ -235,15 +237,8 @@ static void ack_flush(void *_completed)
 {
 }
 
-static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait)
+static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 {
-    const struct cpumask *cpus;
-
-    if (likely(cpumask_available(tmp)))
-        cpus = tmp;
-    else
-        cpus = cpu_online_mask;
-
     if (cpumask_empty(cpus))
         return false;
 
@@ -251,53 +246,55 @@ static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait)
     return true;
 }
 
+static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                  unsigned int req, struct cpumask *tmp,
+                  int current_cpu)
+{
+    int cpu;
+
+    kvm_make_request(req, vcpu);
+
+    if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
+        return;
+
+    /*
+     * Note, the vCPU could get migrated to a different pCPU at any point
+     * after kvm_request_needs_ipi(), which could result in sending an IPI
+     * to the previous pCPU.  But, that's OK because the purpose of the IPI
+     * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
+     * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
+     * after this point is also OK, as the requirement is only that KVM wait
+     * for vCPUs that were reading SPTEs _before_ any changes were
+     * finalized. See kvm_vcpu_kick() for more details on handling requests.
+     */
+    if (kvm_request_needs_ipi(vcpu, req)) {
+        cpu = READ_ONCE(vcpu->cpu);
+        if (cpu != -1 && cpu != current_cpu)
+            __cpumask_set_cpu(cpu, tmp);
+    }
+}
+
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
-                 struct kvm_vcpu *except,
-                 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
+                 unsigned long *vcpu_bitmap)
 {
-    int i, cpu, me;
     struct kvm_vcpu *vcpu;
+    struct cpumask *cpus;
+    int i, me;
     bool called;
 
     me = get_cpu();
 
-    kvm_for_each_vcpu(i, vcpu, kvm) {
-        if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
-            vcpu == except)
-            continue;
+    cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
+    cpumask_clear(cpus);
 
-        kvm_make_request(req, vcpu);
-
-        if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
+    for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
+        vcpu = kvm_get_vcpu(kvm, i);
+        if (!vcpu)
             continue;
-
-        /*
-         * tmp can be "unavailable" if cpumasks are allocated off stack
-         * as allocation of the mask is deliberately not fatal and is
-         * handled by falling back to kicking all online CPUs.
-         */
-        if (!cpumask_available(tmp))
-            continue;
-
-        /*
-         * Note, the vCPU could get migrated to a different pCPU at any
-         * point after kvm_request_needs_ipi(), which could result in
-         * sending an IPI to the previous pCPU.  But, that's ok because
-         * the purpose of the IPI is to ensure the vCPU returns to
-         * OUTSIDE_GUEST_MODE, which is satisfied if the vCPU migrates.
-         * Entering READING_SHADOW_PAGE_TABLES after this point is also
-         * ok, as the requirement is only that KVM wait for vCPUs that
-         * were reading SPTEs _before_ any changes were finalized.  See
-         * kvm_vcpu_kick() for more details on handling requests.
-         */
-        if (kvm_request_needs_ipi(vcpu, req)) {
-            cpu = READ_ONCE(vcpu->cpu);
-            if (cpu != -1 && cpu != me)
-                __cpumask_set_cpu(cpu, tmp);
-        }
+        kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
     }
 
-    called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
+    called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
     put_cpu();
 
     return called;
@@ -306,14 +303,25 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
                       struct kvm_vcpu *except)
 {
-    cpumask_var_t cpus;
+    struct kvm_vcpu *vcpu;
+    struct cpumask *cpus;
     bool called;
+    int i, me;
+
+    me = get_cpu();
 
-    zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+    cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
+    cpumask_clear(cpus);
 
-    called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
+    kvm_for_each_vcpu(i, vcpu, kvm) {
+        if (vcpu == except)
+            continue;
+        kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
+    }
+
+    called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
+    put_cpu();
 
-    free_cpumask_var(cpus);
     return called;
 }
 
@@ -3523,7 +3531,7 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = {
 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
 {
     struct kvm_vcpu *vcpu = file->private_data;
-    unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+    unsigned long pages = vma_pages(vma);
 
     if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
          kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
@@ -3587,7 +3595,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
     struct kvm_vcpu *vcpu;
     struct page *page;
 
-    if (id >= KVM_MAX_VCPU_ID)
+    if (id >= KVM_MAX_VCPU_IDS)
         return -EINVAL;
 
     mutex_lock(&kvm->lock);
@@ -5543,9 +5551,17 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
         goto out_free_3;
     }
 
+    for_each_possible_cpu(cpu) {
+        if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
+                        GFP_KERNEL, cpu_to_node(cpu))) {
+            r = -ENOMEM;
+            goto out_free_4;
+        }
+    }
+
     r = kvm_async_pf_init();
     if (r)
-        goto out_free;
+        goto out_free_5;
 
     kvm_chardev_ops.owner = module;
     kvm_vm_fops.owner = module;
@@ -5571,7 +5587,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 out_unreg:
     kvm_async_pf_deinit();
-out_free:
+out_free_5:
+    for_each_possible_cpu(cpu)
+        free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
+out_free_4:
     kmem_cache_destroy(kvm_vcpu_cache);
 out_free_3:
     unregister_reboot_notifier(&kvm_reboot_notifier);
@@ -5591,8 +5610,12 @@ EXPORT_SYMBOL_GPL(kvm_init);
 
 void kvm_exit(void)
 {
+    int cpu;
+
     debugfs_remove_recursive(kvm_debugfs_dir);
     misc_deregister(&kvm_dev);
+    for_each_possible_cpu(cpu)
+        free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
     kmem_cache_destroy(kvm_vcpu_cache);
     kvm_async_pf_deinit();
     unregister_syscore_ops(&kvm_syscore_ops);
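The kvm_main.c changes above replace the GFP_ATOMIC-allocated (and possibly unavailable) scratch cpumask with a preallocated per-CPU mask that is claimed under get_cpu()/put_cpu(), so the kick path no longer allocates memory or falls back to kicking every online CPU. A rough, stand-alone module sketch of that pattern (hypothetical names such as scratch_mask and kick_other_cpus; assumes an SMP kernel and is not the KVM code itself):

/* sketch: preallocated per-CPU scratch cpumask, claimed with preemption off */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/topology.h>

static DEFINE_PER_CPU(cpumask_var_t, scratch_mask);

static void remote_noop(void *info)
{
}

static void kick_other_cpus(void)
{
	struct cpumask *cpus;
	int me = get_cpu();	/* preemption off: the per-CPU mask is ours */

	cpus = this_cpu_cpumask_var_ptr(scratch_mask);
	cpumask_copy(cpus, cpu_online_mask);
	cpumask_clear_cpu(me, cpus);
	smp_call_function_many(cpus, remote_noop, NULL, true);
	put_cpu();
}

static int __init demo_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		if (!alloc_cpumask_var_node(&per_cpu(scratch_mask, cpu),
					    GFP_KERNEL, cpu_to_node(cpu)))
			goto unwind;	/* free_cpumask_var(NULL) is safe */
	}
	kick_other_cpus();
	return 0;

unwind:
	for_each_possible_cpu(cpu)
		free_cpumask_var(per_cpu(scratch_mask, cpu));
	return -ENOMEM;
}

static void __exit demo_exit(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		free_cpumask_var(per_cpu(scratch_mask, cpu));
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");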