diff options
93 files changed, 3322 insertions, 1246 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0b5a33ee71ee..f0b76ff5030d 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8819,6 +8819,8 @@ means the VM type with value @n is supported. Possible values of @n are:: #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 + #define KVM_X86_SEV_VM 2 + #define KVM_X86_SEV_ES_VM 3 Note, KVM_X86_SW_PROTECTED_VM is currently only for development and testing. Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 84335d119ff1..9677a0714a39 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -76,15 +76,56 @@ are defined in ``<linux/psp-dev.h>``. KVM implements the following commands to support common lifecycle events of SEV guests, such as launching, running, snapshotting, migrating and decommissioning. -1. KVM_SEV_INIT ---------------- +1. KVM_SEV_INIT2 +---------------- -The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform +The KVM_SEV_INIT2 command is used by the hypervisor to initialize the SEV platform context. In a typical workflow, this command should be the first command issued. +For this command to be accepted, either KVM_X86_SEV_VM or KVM_X86_SEV_ES_VM +must have been passed to the KVM_CREATE_VM ioctl. A virtual machine created +with those machine types in turn cannot be run until KVM_SEV_INIT2 is invoked. + +Parameters: struct kvm_sev_init (in) Returns: 0 on success, -negative on error +:: + + struct kvm_sev_init { + __u64 vmsa_features; /* initial value of features field in VMSA */ + __u32 flags; /* must be 0 */ + __u16 ghcb_version; /* maximum guest GHCB version allowed */ + __u16 pad1; + __u32 pad2[8]; + }; + +It is an error if the hypervisor does not support any of the bits that +are set in ``flags`` or ``vmsa_features``. ``vmsa_features`` must be +0 for SEV virtual machines, as they do not have a VMSA. + +``ghcb_version`` must be 0 for SEV virtual machines, as they do not issue GHCB +requests. If ``ghcb_version`` is 0 for any other guest type, then the maximum +allowed guest GHCB protocol will default to version 2. + +This command replaces the deprecated KVM_SEV_INIT and KVM_SEV_ES_INIT commands. +The commands did not have any parameters (the ```data``` field was unused) and +only work for the KVM_X86_DEFAULT_VM machine type (0). + +They behave as if: + +* the VM type is KVM_X86_SEV_VM for KVM_SEV_INIT, or KVM_X86_SEV_ES_VM for + KVM_SEV_ES_INIT + +* the ``flags`` and ``vmsa_features`` fields of ``struct kvm_sev_init`` are + set to zero, and ``ghcb_version`` is set to 0 for KVM_SEV_INIT and 1 for + KVM_SEV_ES_INIT. + +If the ``KVM_X86_SEV_VMSA_FEATURES`` attribute does not exist, the hypervisor only +supports KVM_SEV_INIT and KVM_SEV_ES_INIT. In that case, note that KVM_SEV_ES_INIT +might set the debug swap VMSA feature (bit 5) depending on the value of the +``debug_swap`` parameter of ``kvm-amd.ko``. + 2. KVM_SEV_LAUNCH_START ----------------------- @@ -425,6 +466,18 @@ issued by the hypervisor to make the guest ready for execution. Returns: 0 on success, -negative on error +Device attribute API +==================== + +Attributes of the SEV implementation can be retrieved through the +``KVM_HAS_DEVICE_ATTR`` and ``KVM_GET_DEVICE_ATTR`` ioctls on the ``/dev/kvm`` +device node, using group ``KVM_X86_GRP_SEV``. + +Currently only one attribute is implemented: + +* ``KVM_X86_SEV_VMSA_FEATURES``: return the set of all bits that + are accepted in the ``vmsa_features`` of ``KVM_SEV_INIT2``. + Firmware Management =================== diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index dc04bc767865..ff17849be9f4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1768,40 +1768,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) return false; } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - kvm_pfn_t pfn = pte_pfn(range->arg.pte); - - if (!kvm->arch.mmu.pgt) - return false; - - WARN_ON(range->end - range->start != 1); - - /* - * If the page isn't tagged, defer to user_mem_abort() for sanitising - * the MTE tags. The S2 pte should have been unmapped by - * mmu_notifier_invalidate_range_end(). - */ - if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn))) - return false; - - /* - * We've moved a page around, probably through CoW, so let's treat - * it just like a translation fault and the map handler will clean - * the cache to the PoC. - * - * The MMU notifiers will have unmapped a huge PMD before calling - * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and - * therefore we never need to clear out a huge PMD through this - * calling path and a memcache is not required. - */ - kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, - PAGE_SIZE, __pfn_to_phys(pfn), - KVM_PGTABLE_PROT_R, NULL, 0); - - return false; -} - bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { u64 size = (range->end - range->start) << PAGE_SHIFT; diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index fc2edaefd688..c87b6ea0ec47 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -236,7 +236,6 @@ void kvm_flush_tlb_all(void); void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa); int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index a556cff35740..98883aa23ab8 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -494,38 +494,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) range->end << PAGE_SHIFT, &ctx); } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - unsigned long prot_bits; - kvm_pte_t *ptep; - kvm_pfn_t pfn = pte_pfn(range->arg.pte); - gpa_t gpa = range->start << PAGE_SHIFT; - - ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); - if (!ptep) - return false; - - /* Replacing an absent or old page doesn't need flushes */ - if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) { - kvm_set_pte(ptep, 0); - return false; - } - - /* Fill new pte if write protected or page migrated */ - prot_bits = _PAGE_PRESENT | __READABLE; - prot_bits |= _CACHE_MASK & pte_val(range->arg.pte); - - /* - * Set _PAGE_WRITE or _PAGE_DIRTY iff old and new pte both support - * _PAGE_WRITE for map_page_fast if next page write fault - * _PAGE_DIRTY since gpa has already recorded as dirty page - */ - prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte); - kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits))); - - return true; -} - bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_ptw_ctx ctx; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 467ee6b95ae1..c17157e700c0 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -444,36 +444,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) return true; } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - gpa_t gpa = range->start << PAGE_SHIFT; - pte_t hva_pte = range->arg.pte; - pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); - pte_t old_pte; - - if (!gpa_pte) - return false; - - /* Mapping may need adjusting depending on memslot flags */ - old_pte = *gpa_pte; - if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) - hva_pte = pte_mkclean(hva_pte); - else if (range->slot->flags & KVM_MEM_READONLY) - hva_pte = pte_wrprotect(hva_pte); - - set_pte(gpa_pte, hva_pte); - - /* Replacing an absent or old page doesn't need flushes */ - if (!pte_present(old_pte) || !pte_young(old_pte)) - return false; - - /* Pages swapped, aged, moved, or cleaned require flushes */ - return !pte_present(hva_pte) || - !pte_young(hva_pte) || - pte_pfn(old_pte) != pte_pfn(hva_pte) || - (pte_dirty(old_pte) && !pte_dirty(hva_pte)); -} - bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 3281215097cc..ca3829d47ab7 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -287,7 +287,6 @@ struct kvmppc_ops { bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); - bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); void (*free_memslot)(struct kvm_memory_slot *slot); int (*init_vm)(struct kvm *kvm); void (*destroy_vm)(struct kvm *kvm); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 8acec144120e..0d0624088e6b 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -899,11 +899,6 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return kvm->arch.kvm_ops->test_age_gfn(kvm, range); } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); -} - int kvmppc_core_init_vm(struct kvm *kvm) { diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 58391b4b32ed..4aa2ab89afbc 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -12,7 +12,6 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); -extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2b1f0cdd8c18..1b51b1c4713b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1010,18 +1010,6 @@ bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) return kvm_test_age_rmapp(kvm, range->slot, range->start); } -bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) -{ - WARN_ON(range->start + 1 != range->end); - - if (kvm_is_radix(kvm)) - kvm_unmap_radix(kvm, range->slot, range->start); - else - kvm_unmap_rmapp(kvm, range->slot, range->start); - - return false; -} - static int vcpus_running(struct kvm *kvm) { return atomic_read(&kvm->arch.vcpus_running) != 0; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8e86eb577eb8..35cb014a0c51 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -6364,7 +6364,6 @@ static struct kvmppc_ops kvm_ops_hv = { .unmap_gfn_range = kvm_unmap_gfn_range_hv, .age_gfn = kvm_age_gfn_hv, .test_age_gfn = kvm_test_age_gfn_hv, - .set_spte_gfn = kvm_set_spte_gfn_hv, .free_memslot = kvmppc_core_free_memslot_hv, .init_vm = kvmppc_core_init_vm_hv, .destroy_vm = kvmppc_core_destroy_vm_hv, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 5b92619a05fd..a7d7137ea0c8 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -461,12 +461,6 @@ static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) return false; } -static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) -{ - /* The page will get remapped properly on its next fault */ - return do_kvm_unmap_gfn(kvm, range); -} - /*****************************************/ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) @@ -2071,7 +2065,6 @@ static struct kvmppc_ops kvm_ops_pr = { .unmap_gfn_range = kvm_unmap_gfn_range_pr, .age_gfn = kvm_age_gfn_pr, .test_age_gfn = kvm_test_age_gfn_pr, - .set_spte_gfn = kvm_set_spte_gfn_pr, .free_memslot = kvmppc_core_free_memslot_pr, .init_vm = kvmppc_core_init_vm_pr, .destroy_vm = kvmppc_core_destroy_vm_pr, diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index ccb8f16ffe41..c664fdec75b1 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -747,12 +747,6 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return false; } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - /* The page will get remapped properly on its next fault */ - return kvm_e500_mmu_unmap_gfn(kvm, range); -} - /*****************************************/ int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 2468c55933cd..25966995da04 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -168,7 +168,8 @@ #define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT) #define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \ (_AC(1, UL) << IRQ_S_TIMER) | \ - (_AC(1, UL) << IRQ_S_EXT)) + (_AC(1, UL) << IRQ_S_EXT) | \ + (_AC(1, UL) << IRQ_PMU_OVF)) /* AIA CSR bits */ #define TOPI_IID_SHIFT 16 @@ -281,7 +282,7 @@ #define CSR_HPMCOUNTER30H 0xc9e #define CSR_HPMCOUNTER31H 0xc9f -#define CSR_SSCOUNTOVF 0xda0 +#define CSR_SCOUNTOVF 0xda0 #define CSR_SSTATUS 0x100 #define CSR_SIE 0x104 diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 484d04a92fa6..d96281278586 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -43,6 +43,17 @@ KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(6) +#define KVM_HEDELEG_DEFAULT (BIT(EXC_INST_MISALIGNED) | \ + BIT(EXC_BREAKPOINT) | \ + BIT(EXC_SYSCALL) | \ + BIT(EXC_INST_PAGE_FAULT) | \ + BIT(EXC_LOAD_PAGE_FAULT) | \ + BIT(EXC_STORE_PAGE_FAULT)) + +#define KVM_HIDELEG_DEFAULT (BIT(IRQ_VS_SOFT) | \ + BIT(IRQ_VS_TIMER) | \ + BIT(IRQ_VS_EXT)) + enum kvm_riscv_hfence_type { KVM_RISCV_HFENCE_UNKNOWN = 0, KVM_RISCV_HFENCE_GVMA_VMID_GPA, @@ -169,6 +180,7 @@ struct kvm_vcpu_csr { struct kvm_vcpu_config { u64 henvcfg; u64 hstateen0; + unsigned long hedeleg; }; struct kvm_vcpu_smstateen_csr { @@ -211,6 +223,7 @@ struct kvm_vcpu_arch { /* CPU context upon Guest VCPU reset */ struct kvm_cpu_context guest_reset_context; + spinlock_t reset_cntx_lock; /* CPU CSR context upon Guest VCPU reset */ struct kvm_vcpu_csr guest_reset_csr; @@ -252,8 +265,9 @@ struct kvm_vcpu_arch { /* Cache pages needed to program page tables with spinlock held */ struct kvm_mmu_memory_cache mmu_page_cache; - /* VCPU power-off state */ - bool power_off; + /* VCPU power state */ + struct kvm_mp_state mp_state; + spinlock_t mp_state_lock; /* Don't run the VCPU (blocked) */ bool pause; @@ -374,8 +388,11 @@ int kvm_riscv_vcpu_unset_interrupt(struct kvm_vcpu *vcpu, unsigned int irq); void kvm_riscv_vcpu_flush_interrupts(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu); bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, u64 mask); +void __kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu); +void __kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu); +bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/include/asm/kvm_vcpu_pmu.h b/arch/riscv/include/asm/kvm_vcpu_pmu.h index 395518a1664e..fa0f535bbbf0 100644 --- a/arch/riscv/include/asm/kvm_vcpu_pmu.h +++ b/arch/riscv/include/asm/kvm_vcpu_pmu.h @@ -20,7 +20,7 @@ static_assert(RISCV_KVM_MAX_COUNTERS <= 64); struct kvm_fw_event { /* Current value of the event */ - unsigned long value; + u64 value; /* Event monitoring status */ bool started; @@ -36,6 +36,7 @@ struct kvm_pmc { bool started; /* Monitoring event ID */ unsigned long event_idx; + struct kvm_vcpu *vcpu; }; /* PMU data structure per vcpu */ @@ -50,6 +51,12 @@ struct kvm_pmu { bool init_done; /* Bit map of all the virtual counter used */ DECLARE_BITMAP(pmc_in_use, RISCV_KVM_MAX_COUNTERS); + /* Bit map of all the virtual counter overflown */ + DECLARE_BITMAP(pmc_overflown, RISCV_KVM_MAX_COUNTERS); + /* The address of the counter snapshot area (guest physical address) */ + gpa_t snapshot_addr; + /* The actual data of the snapshot */ + struct riscv_pmu_snapshot_data *sdata; }; #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu_context) @@ -82,9 +89,14 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba unsigned long ctr_mask, unsigned long flags, unsigned long eidx, u64 evtdata, struct kvm_vcpu_sbi_return *retdata); -int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, +int kvm_riscv_vcpu_pmu_fw_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, struct kvm_vcpu_sbi_return *retdata); +int kvm_riscv_vcpu_pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata); void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long saddr_low, + unsigned long saddr_high, unsigned long flags, + struct kvm_vcpu_sbi_return *retdata); void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 6e68f8dff76b..112a0a0d9f46 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -131,6 +131,8 @@ enum sbi_ext_pmu_fid { SBI_EXT_PMU_COUNTER_START, SBI_EXT_PMU_COUNTER_STOP, SBI_EXT_PMU_COUNTER_FW_READ, + SBI_EXT_PMU_COUNTER_FW_READ_HI, + SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, }; union sbi_pmu_ctr_info { @@ -147,6 +149,13 @@ union sbi_pmu_ctr_info { }; }; +/* Data structure to contain the pmu snapshot data */ +struct riscv_pmu_snapshot_data { + u64 ctr_overflow_mask; + u64 ctr_values[64]; + u64 reserved[447]; +}; + #define RISCV_PMU_RAW_EVENT_MASK GENMASK_ULL(47, 0) #define RISCV_PMU_RAW_EVENT_IDX 0x20000 @@ -232,20 +241,22 @@ enum sbi_pmu_ctr_type { #define SBI_PMU_EVENT_IDX_INVALID 0xFFFFFFFF /* Flags defined for config matching function */ -#define SBI_PMU_CFG_FLAG_SKIP_MATCH (1 << 0) -#define SBI_PMU_CFG_FLAG_CLEAR_VALUE (1 << 1) -#define SBI_PMU_CFG_FLAG_AUTO_START (1 << 2) -#define SBI_PMU_CFG_FLAG_SET_VUINH (1 << 3) -#define SBI_PMU_CFG_FLAG_SET_VSINH (1 << 4) -#define SBI_PMU_CFG_FLAG_SET_UINH (1 << 5) -#define SBI_PMU_CFG_FLAG_SET_SINH (1 << 6) -#define SBI_PMU_CFG_FLAG_SET_MINH (1 << 7) +#define SBI_PMU_CFG_FLAG_SKIP_MATCH BIT(0) +#define SBI_PMU_CFG_FLAG_CLEAR_VALUE BIT(1) +#define SBI_PMU_CFG_FLAG_AUTO_START BIT(2) +#define SBI_PMU_CFG_FLAG_SET_VUINH BIT(3) +#define SBI_PMU_CFG_FLAG_SET_VSINH BIT(4) +#define SBI_PMU_CFG_FLAG_SET_UINH BIT(5) +#define SBI_PMU_CFG_FLAG_SET_SINH BIT(6) +#define SBI_PMU_CFG_FLAG_SET_MINH BIT(7) /* Flags defined for counter start function */ -#define SBI_PMU_START_FLAG_SET_INIT_VALUE (1 << 0) +#define SBI_PMU_START_FLAG_SET_INIT_VALUE BIT(0) +#define SBI_PMU_START_FLAG_INIT_SNAPSHOT BIT(1) /* Flags defined for counter stop function */ -#define SBI_PMU_STOP_FLAG_RESET (1 << 0) +#define SBI_PMU_STOP_FLAG_RESET BIT(0) +#define SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT BIT(1) enum sbi_ext_dbcn_fid { SBI_EXT_DBCN_CONSOLE_WRITE = 0, @@ -266,7 +277,7 @@ struct sbi_sta_struct { u8 pad[47]; } __packed; -#define SBI_STA_SHMEM_DISABLE -1 +#define SBI_SHMEM_DISABLE -1 /* SBI spec version fields */ #define SBI_SPEC_VERSION_DEFAULT 0x1 @@ -284,6 +295,7 @@ struct sbi_sta_struct { #define SBI_ERR_ALREADY_AVAILABLE -6 #define SBI_ERR_ALREADY_STARTED -7 #define SBI_ERR_ALREADY_STOPPED -8 +#define SBI_ERR_NO_SHMEM -9 extern unsigned long sbi_spec_version; struct sbiret { @@ -355,8 +367,8 @@ static inline unsigned long sbi_minor_version(void) static inline unsigned long sbi_mk_version(unsigned long major, unsigned long minor) { - return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << - SBI_SPEC_VERSION_MAJOR_SHIFT) | minor; + return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << SBI_SPEC_VERSION_MAJOR_SHIFT) + | (minor & SBI_SPEC_VERSION_MINOR_MASK); } int sbi_err_map_linux_errno(int err); diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index b1c503c2959c..e878e7cc3978 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -167,6 +167,7 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZFA, KVM_RISCV_ISA_EXT_ZTSO, KVM_RISCV_ISA_EXT_ZACAS, + KVM_RISCV_ISA_EXT_SSCOFPMF, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c index 0d6225fd3194..fa6b0339a65d 100644 --- a/arch/riscv/kernel/paravirt.c +++ b/arch/riscv/kernel/paravirt.c @@ -62,7 +62,7 @@ static int sbi_sta_steal_time_set_shmem(unsigned long lo, unsigned long hi, ret = sbi_ecall(SBI_EXT_STA, SBI_EXT_STA_STEAL_TIME_SET_SHMEM, lo, hi, flags, 0, 0, 0); if (ret.error) { - if (lo == SBI_STA_SHMEM_DISABLE && hi == SBI_STA_SHMEM_DISABLE) + if (lo == SBI_SHMEM_DISABLE && hi == SBI_SHMEM_DISABLE) pr_warn("Failed to disable steal-time shmem"); else pr_warn("Failed to set steal-time shmem"); @@ -84,8 +84,8 @@ static int pv_time_cpu_online(unsigned int cpu) static int pv_time_cpu_down_prepare(unsigned int cpu) { - return sbi_sta_steal_time_set_shmem(SBI_STA_SHMEM_DISABLE, - SBI_STA_SHMEM_DISABLE, 0); + return sbi_sta_steal_time_set_shmem(SBI_SHMEM_DISABLE, + SBI_SHMEM_DISABLE, 0); } static u64 pv_time_steal_clock(int cpu) diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index a944294f6f23..0f0a9d11bb5f 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -545,6 +545,9 @@ void kvm_riscv_aia_enable(void) enable_percpu_irq(hgei_parent_irq, irq_get_trigger_type(hgei_parent_irq)); csr_set(CSR_HIE, BIT(IRQ_S_GEXT)); + /* Enable IRQ filtering for overflow interrupt only if sscofpmf is present */ + if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF)) + csr_write(CSR_HVIEN, BIT(IRQ_PMU_OVF)); } void kvm_riscv_aia_disable(void) @@ -558,6 +561,8 @@ void kvm_riscv_aia_disable(void) return; hgctrl = get_cpu_ptr(&aia_hgei); + if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF)) + csr_clear(CSR_HVIEN, BIT(IRQ_PMU_OVF)); /* Disable per-CPU SGEI interrupt */ csr_clear(CSR_HIE, BIT(IRQ_S_GEXT)); disable_percpu_irq(hgei_parent_irq); diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 225a435d9c9a..bab2ec34cd87 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -22,22 +22,8 @@ long kvm_arch_dev_ioctl(struct file *filp, int kvm_arch_hardware_enable(void) { - unsigned long hideleg, hedeleg; - - hedeleg = 0; - hedeleg |= (1UL << EXC_INST_MISALIGNED); - hedeleg |= (1UL << EXC_BREAKPOINT); - hedeleg |= (1UL << EXC_SYSCALL); - hedeleg |= (1UL << EXC_INST_PAGE_FAULT); - hedeleg |= (1UL << EXC_LOAD_PAGE_FAULT); - hedeleg |= (1UL << EXC_STORE_PAGE_FAULT); - csr_write(CSR_HEDELEG, hedeleg); - - hideleg = 0; - hideleg |= (1UL << IRQ_VS_SOFT); - hideleg |= (1UL << IRQ_VS_TIMER); - hideleg |= (1UL << IRQ_VS_EXT); - csr_write(CSR_HIDELEG, hideleg); + csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT); + csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT); /* VS should access only the time counter directly. Everything else should trap */ csr_write(CSR_HCOUNTEREN, 0x02); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index a9e2fd7245e1..b63650f9b966 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -550,26 +550,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) return false; } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - int ret; - kvm_pfn_t pfn = pte_pfn(range->arg.pte); - - if (!kvm->arch.pgd) - return false; - - WARN_ON(range->end - range->start != 1); - - ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT, - __pfn_to_phys(pfn), PAGE_SIZE, true, true); - if (ret) { - kvm_debug("Failed to map G-stage page (error %d)\n", ret); - return true; - } - - return false; -} - bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { pte_t *ptep; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index b5ca9f2e98ac..17e21df36cc1 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -64,7 +64,9 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) memcpy(csr, reset_csr, sizeof(*csr)); + spin_lock(&vcpu->arch.reset_cntx_lock); memcpy(cntx, reset_cntx, sizeof(*cntx)); + spin_unlock(&vcpu->arch.reset_cntx_lock); kvm_riscv_vcpu_fp_reset(vcpu); @@ -102,6 +104,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) struct kvm_cpu_context *cntx; struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; + spin_lock_init(&vcpu->arch.mp_state_lock); + /* Mark this VCPU never ran */ vcpu->arch.ran_atleast_once = false; vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; @@ -119,12 +123,16 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) spin_lock_init(&vcpu->arch.hfence_lock); /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */ + spin_lock_init(&vcpu->arch.reset_cntx_lock); + + spin_lock(&vcpu->arch.reset_cntx_lock); cntx = &vcpu->arch.guest_reset_context; cntx->sstatus = SR_SPP | SR_SPIE; cntx->hstatus = 0; cntx->hstatus |= HSTATUS_VTW; cntx->hstatus |= HSTATUS_SPVP; cntx->hstatus |= HSTATUS_SPV; + spin_unlock(&vcpu->arch.reset_cntx_lock); if (kvm_riscv_vcpu_alloc_vector_context(vcpu, cntx)) return -ENOMEM; @@ -201,7 +209,7 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) && - !vcpu->arch.power_off && !vcpu->arch.pause); + !kvm_riscv_vcpu_stopped(vcpu) && !vcpu->arch.pause); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) @@ -365,6 +373,13 @@ void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu) } } + /* Sync up the HVIP.LCOFIP bit changes (only clear) by the guest */ + if ((csr->hvip ^ hvip) & (1UL << IRQ_PMU_OVF)) { + if (!(hvip & (1UL << IRQ_PMU_OVF)) && + !test_and_set_bit(IRQ_PMU_OVF, v->irqs_pending_mask)) + clear_bit(IRQ_PMU_OVF, v->irqs_pending); + } + /* Sync-up AIA high interrupts */ kvm_riscv_vcpu_aia_sync_interrupts(vcpu); @@ -382,7 +397,8 @@ int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) if (irq < IRQ_LOCAL_MAX && irq != IRQ_VS_SOFT && irq != IRQ_VS_TIMER && - irq != IRQ_VS_EXT) + irq != IRQ_VS_EXT && + irq != IRQ_PMU_OVF) return -EINVAL; set_bit(irq, vcpu->arch.irqs_pending); @@ -397,14 +413,15 @@ int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) int kvm_riscv_vcpu_unset_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) { /* - * We only allow VS-mode software, timer, and external + * We only allow VS-mode software, timer, counter overflow and external * interrupts when irq is one of the local interrupts * defined by RISC-V privilege specification. */ if (irq < IRQ_LOCAL_MAX && irq != IRQ_VS_SOFT && irq != IRQ_VS_TIMER && - irq != IRQ_VS_EXT) + irq != IRQ_VS_EXT && + irq != IRQ_PMU_OVF) return -EINVAL; clear_bit(irq, vcpu->arch.irqs_pending); @@ -429,26 +446,42 @@ bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, u64 mask) return kvm_riscv_vcpu_aia_has_interrupts(vcpu, mask); } -void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu) +void __kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu) { - vcpu->arch.power_off = true; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); kvm_make_request(KVM_REQ_SLEEP, vcpu); kvm_vcpu_kick(vcpu); } -void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu) +void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.mp_state_lock); + __kvm_riscv_vcpu_power_off(vcpu); + spin_unlock(&vcpu->arch.mp_state_lock); +} + +void __kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu) { - vcpu->arch.power_off = false; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); kvm_vcpu_wake_up(vcpu); } +void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.mp_state_lock); + __kvm_riscv_vcpu_power_on(vcpu); + spin_unlock(&vcpu->arch.mp_state_lock); +} + +bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu) +{ + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (vcpu->arch.power_off) - mp_state->mp_state = KVM_MP_STATE_STOPPED; - else - mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + *mp_state = READ_ONCE(vcpu->arch.mp_state); return 0; } @@ -458,25 +491,36 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { int ret = 0; + spin_lock(&vcpu->arch.mp_state_lock); + switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.power_off = false; + WRITE_ONCE(vcpu->arch.mp_state, *mp_state); break; case KVM_MP_STATE_STOPPED: - kvm_riscv_vcpu_power_off(vcpu); + __kvm_riscv_vcpu_power_off(vcpu); break; default: ret = -EINVAL; } + spin_unlock(&vcpu->arch.mp_state_lock); + return ret; } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - /* TODO; To be implemented later. */ - return -EINVAL; + if (dbg->control & KVM_GUESTDBG_ENABLE) { + vcpu->guest_debug = dbg->control; + vcpu->arch.cfg.hedeleg &= ~BIT(EXC_BREAKPOINT); + } else { + vcpu->guest_debug = 0; + vcpu->arch.cfg.hedeleg |= BIT(EXC_BREAKPOINT); + } + + return 0; } static void kvm_riscv_vcpu_setup_config(struct kvm_vcpu *vcpu) @@ -505,6 +549,10 @@ static void kvm_riscv_vcpu_setup_config(struct kvm_vcpu *vcpu) if (riscv_isa_extension_available(isa, SMSTATEEN)) cfg->hstateen0 |= SMSTATEEN0_SSTATEEN0; } + + cfg->hedeleg = KVM_HEDELEG_DEFAULT; + if (vcpu->guest_debug) + cfg->hedeleg &= ~BIT(EXC_BREAKPOINT); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -519,6 +567,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) csr_write(CSR_VSEPC, csr->vsepc); csr_write(CSR_VSCAUSE, csr->vscause); csr_write(CSR_VSTVAL, csr->vstval); + csr_write(CSR_HEDELEG, cfg->hedeleg); csr_write(CSR_HVIP, csr->hvip); csr_write(CSR_VSATP, csr->vsatp); csr_write(CSR_HENVCFG, cfg->henvcfg); @@ -584,11 +633,11 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) { kvm_vcpu_srcu_read_unlock(vcpu); rcuwait_wait_event(wait, - (!vcpu->arch.power_off) && (!vcpu->arch.pause), + (!kvm_riscv_vcpu_stopped(vcpu)) && (!vcpu->arch.pause), TASK_INTERRUPTIBLE); kvm_vcpu_srcu_read_lock(vcpu); - if (vcpu->arch.power_off || vcpu->arch.pause) { + if (kvm_riscv_vcpu_stopped(vcpu) || vcpu->arch.pause) { /* * Awaken to handle a signal, request to * sleep again later. diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index 2415722c01b8..5761f95abb60 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -204,6 +204,10 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) ret = kvm_riscv_vcpu_sbi_ecall(vcpu, run); break; + case EXC_BREAKPOINT: + run->exit_reason = KVM_EXIT_DEBUG; + ret = 0; + break; default: break; } diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 994adc26db4b..c676275ea0a0 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -36,6 +36,7 @@ static const unsigned long kvm_isa_ext_arr[] = { /* Multi letter extensions (alphabetically sorted) */ KVM_ISA_EXT_ARR(SMSTATEEN), KVM_ISA_EXT_ARR(SSAIA), + KVM_ISA_EXT_ARR(SSCOFPMF), KVM_ISA_EXT_ARR(SSTC), KVM_ISA_EXT_ARR(SVINVAL), KVM_ISA_EXT_ARR(SVNAPOT), @@ -99,6 +100,9 @@ static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext) switch (ext) { case KVM_RISCV_ISA_EXT_H: return false; + case KVM_RISCV_ISA_EXT_SSCOFPMF: + /* Sscofpmf depends on interrupt filtering defined in ssaia */ + return __riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSAIA); case KVM_RISCV_ISA_EXT_V: return riscv_v_vstate_ctrl_user_allowed(); default: @@ -116,6 +120,8 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) case KVM_RISCV_ISA_EXT_C: case KVM_RISCV_ISA_EXT_I: case KVM_RISCV_ISA_EXT_M: + /* There is not architectural config bit to disable sscofpmf completely */ + case KVM_RISCV_ISA_EXT_SSCOFPMF: case KVM_RISCV_ISA_EXT_SSTC: case KVM_RISCV_ISA_EXT_SVINVAL: case KVM_RISCV_ISA_EXT_SVNAPOT: diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index 86391a5061dd..04db1f993c47 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -14,6 +14,7 @@ #include <asm/csr.h> #include <asm/kvm_vcpu_sbi.h> #include <asm/kvm_vcpu_pmu.h> +#include <asm/sbi.h> #include <linux/bitops.h> #define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs) @@ -39,7 +40,7 @@ static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc) u64 sample_period; if (!pmc->counter_val) - sample_period = counter_val_mask + 1; + sample_period = counter_val_mask; else sample_period = (-pmc->counter_val) & counter_val_mask; @@ -196,6 +197,36 @@ static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx, return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask); } +static int pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx, + unsigned long *out_val) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int fevent_code; + + if (!IS_ENABLED(CONFIG_32BIT)) { + pr_warn("%s: should be invoked for only RV32\n", __func__); + return -EINVAL; + } + + if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) { + pr_warn("Invalid counter id [%ld]during read\n", cidx); + return -EINVAL; + } + + pmc = &kvpmu->pmc[cidx]; + + if (pmc->cinfo.type != SBI_PMU_CTR_TYPE_FW) + return -EINVAL; + + fevent_code = get_event_code(pmc->event_idx); + pmc->counter_val = kvpmu->fw_event[fevent_code].value; + + *out_val = pmc->counter_val >> 32; + + return 0; +} + static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, unsigned long *out_val) { @@ -204,6 +235,11 @@ static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, u64 enabled, running; int fevent_code; + if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) { + pr_warn("Invalid counter id [%ld] during read\n", cidx); + return -EINVAL; + } + pmc = &kvpmu->pmc[cidx]; if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { @@ -229,8 +265,50 @@ static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ct return 0; } -static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr, - unsigned long flags, unsigned long eidx, unsigned long evtdata) +static void kvm_riscv_pmu_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct kvm_vcpu *vcpu = pmc->vcpu; + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct riscv_pmu *rpmu = to_riscv_pmu(perf_event->pmu); + u64 period; + + /* + * Stop the event counting by directly accessing the perf_event. + * Otherwise, this needs to deferred via a workqueue. + * That will introduce skew in the counter value because the actual + * physical counter would start after returning from this function. + * It will be stopped again once the workqueue is scheduled + */ + rpmu->pmu.stop(perf_event, PERF_EF_UPDATE); + + /* + * The hw counter would start automatically when this function returns. + * Thus, the host may continue to interrupt and inject it to the guest + * even without the guest configuring the next event. Depending on the hardware + * the host may have some sluggishness only if privilege mode filtering is not + * available. In an ideal world, where qemu is not the only capable hardware, + * this can be removed. + * FYI: ARM64 does this way while x86 doesn't do anything as such. + * TODO: Should we keep it for RISC-V ? + */ + period = -(local64_read(&perf_event->count)); + + local64_set(&perf_event->hw.period_left, 0); + perf_event->attr.sample_period = period; + perf_event->hw.sample_period = period; + + set_bit(pmc->idx, kvpmu->pmc_overflown); + kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_PMU_OVF); + + rpmu->pmu.start(perf_event, PERF_EF_RELOAD); +} + +static long kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr, + unsigned long flags, unsigned long eidx, + unsigned long evtdata) { struct perf_event *event; @@ -247,7 +325,7 @@ static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr */ attr->sample_period = kvm_pmu_get_sample_period(pmc); - event = perf_event_create_kernel_counter(attr, -1, current, NULL, pmc); + event = perf_event_create_kernel_counter(attr, -1, current, kvm_riscv_pmu_overflow, pmc); if (IS_ERR(event)) { pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event)); return PTR_ERR(event); @@ -310,6 +388,80 @@ int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num, return ret; } +static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data); + + if (kvpmu->sdata) { + if (kvpmu->snapshot_addr != INVALID_GPA) { + memset(kvpmu->sdata, 0, snapshot_area_size); + kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, + kvpmu->sdata, snapshot_area_size); + } else { + pr_warn("snapshot address invalid\n"); + } + kfree(kvpmu->sdata); + kvpmu->sdata = NULL; + } + kvpmu->snapshot_addr = INVALID_GPA; +} + +int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long saddr_low, + unsigned long saddr_high, unsigned long flags, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data); + int sbiret = 0; + gpa_t saddr; + unsigned long hva; + bool writable; + + if (!kvpmu || flags) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + if (saddr_low == SBI_SHMEM_DISABLE && saddr_high == SBI_SHMEM_DISABLE) { + kvm_pmu_clear_snapshot_area(vcpu); + return 0; + } + + saddr = saddr_low; + + if (saddr_high != 0) { + if (IS_ENABLED(CONFIG_32BIT)) + saddr |= ((gpa_t)saddr_high << 32); + else + sbiret = SBI_ERR_INVALID_ADDRESS; + goto out; + } + + hva = kvm_vcpu_gfn_to_hva_prot(vcpu, saddr >> PAGE_SHIFT, &writable); + if (kvm_is_error_hva(hva) || !writable) { + sbiret = SBI_ERR_INVALID_ADDRESS; + goto out; + } + + kvpmu->sdata = kzalloc(snapshot_area_size, GFP_ATOMIC); + if (!kvpmu->sdata) + return -ENOMEM; + + if (kvm_vcpu_write_guest(vcpu, saddr, kvpmu->sdata, snapshot_area_size)) { + kfree(kvpmu->sdata); + sbiret = SBI_ERR_FAILURE; + goto out; + } + + kvpmu->snapshot_addr = saddr; + +out: + retdata->err_val = sbiret; + + return 0; +} + int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu, struct kvm_vcpu_sbi_return *retdata) { @@ -343,20 +495,40 @@ int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base, int i, pmc_index, sbiret = 0; struct kvm_pmc *pmc; int fevent_code; + bool snap_flag_set = flags & SBI_PMU_START_FLAG_INIT_SNAPSHOT; if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { sbiret = SBI_ERR_INVALID_PARAM; goto out; } + if (snap_flag_set) { + if (kvpmu->snapshot_addr == INVALID_GPA) { + sbiret = SBI_ERR_NO_SHMEM; + goto out; + } + if (kvm_vcpu_read_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata, + sizeof(struct riscv_pmu_snapshot_data))) { + pr_warn("Unable to read snapshot shared memory while starting counters\n"); + sbiret = SBI_ERR_FAILURE; + goto out; + } + } /* Start the counters that have been configured and requested by the guest */ for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) { pmc_index = i + ctr_base; if (!test_bit(pmc_index, kvpmu->pmc_in_use)) continue; + /* The guest started the counter again. Reset the overflow status */ + clear_bit(pmc_index, kvpmu->pmc_overflown); pmc = &kvpmu->pmc[pmc_index]; - if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) + if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) { pmc->counter_val = ival; + } else if (snap_flag_set) { + /* The counter index in the snapshot are relative to the counter base */ + pmc->counter_val = kvpmu->sdata->ctr_values[i]; + } + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { fevent_code = get_event_code(pmc->event_idx); if (fevent_code >= SBI_PMU_FW_MAX) { @@ -400,12 +572,19 @@ int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base, u64 enabled, running; struct kvm_pmc *pmc; int fevent_code; + bool snap_flag_set = flags & SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT; + bool shmem_needs_update = false; if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { sbiret = SBI_ERR_INVALID_PARAM; goto out; } + if (snap_flag_set && kvpmu->snapshot_addr == INVALID_GPA) { + sbiret = SBI_ERR_NO_SHMEM; + goto out; + } + /* Stop the counters that have been configured and requested by the guest */ for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) { pmc_index = i + ctr_base; @@ -432,21 +611,49 @@ int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base, sbiret = SBI_ERR_ALREADY_STOPPED; } - if (flags & SBI_PMU_STOP_FLAG_RESET) { - /* Relase the counter if this is a reset request */ - pmc->counter_val += perf_event_read_value(pmc->perf_event, - &enabled, &running); + if (flags & SBI_PMU_STOP_FLAG_RESET) + /* Release the counter if this is a reset request */ kvm_pmu_release_perf_event(pmc); - } } else { sbiret = SBI_ERR_INVALID_PARAM; } + + if (snap_flag_set && !sbiret) { + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) + pmc->counter_val = kvpmu->fw_event[fevent_code].value; + else if (pmc->perf_event) + pmc->counter_val += perf_event_read_value(pmc->perf_event, + &enabled, &running); + /* + * The counter and overflow indicies in the snapshot region are w.r.to + * cbase. Modify the set bit in the counter mask instead of the pmc_index + * which indicates the absolute counter index. + */ + if (test_bit(pmc_index, kvpmu->pmc_overflown)) + kvpmu->sdata->ctr_overflow_mask |= BIT(i); + kvpmu->sdata->ctr_values[i] = pmc->counter_val; + shmem_needs_update = true; + } + if (flags & SBI_PMU_STOP_FLAG_RESET) { pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; clear_bit(pmc_index, kvpmu->pmc_in_use); + clear_bit(pmc_index, kvpmu->pmc_overflown); + if (snap_flag_set) { + /* + * Only clear the given counter as the caller is responsible to + * validate both the overflow mask and configured counters. + */ + kvpmu->sdata->ctr_overflow_mask &= ~BIT(i); + shmem_needs_update = true; + } } } + if (shmem_needs_update) + kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata, + sizeof(struct riscv_pmu_snapshot_data)); + out: retdata->err_val = sbiret; @@ -458,7 +665,8 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba unsigned long eidx, u64 evtdata, struct kvm_vcpu_sbi_return *retdata) { - int ctr_idx, ret, sbiret = 0; + int ctr_idx, sbiret = 0; + long ret; bool is_fevent; unsigned long event_code; u32 etype = kvm_pmu_get_perf_event_type(eidx); @@ -517,8 +725,10 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba kvpmu->fw_event[event_code].started = true; } else { ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata); - if (ret) - return ret; + if (ret) { + sbiret = SBI_ERR_NOT_SUPPORTED; + goto out; + } } set_bit(ctr_idx, kvpmu->pmc_in_use); @@ -530,7 +740,19 @@ out: return 0; } -int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, +int kvm_riscv_vcpu_pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata) +{ + int ret; + + ret = pmu_fw_ctr_read_hi(vcpu, cidx, &retdata->out_val); + if (ret == -EINVAL) + retdata->err_val = SBI_ERR_INVALID_PARAM; + + return 0; +} + +int kvm_riscv_vcpu_pmu_fw_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, struct kvm_vcpu_sbi_return *retdata) { int ret; @@ -566,6 +788,7 @@ void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) kvpmu->num_hw_ctrs = num_hw_ctrs + 1; kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX; memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event)); + kvpmu->snapshot_addr = INVALID_GPA; if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) { pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA"); @@ -585,6 +808,7 @@ void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) pmc = &kvpmu->pmc[i]; pmc->idx = i; pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + pmc->vcpu = vcpu; if (i < kvpmu->num_hw_ctrs) { pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW; if (i < 3) @@ -601,7 +825,7 @@ void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) pmc->cinfo.csr = CSR_CYCLE + i; } else { pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW; - pmc->cinfo.width = BITS_PER_LONG - 1; + pmc->cinfo.width = 63; } } @@ -617,14 +841,16 @@ void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu) if (!kvpmu) return; - for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_MAX_COUNTERS) { + for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS) { pmc = &kvpmu->pmc[i]; pmc->counter_val = 0; kvm_pmu_release_perf_event(pmc); pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; } - bitmap_zero(kvpmu->pmc_in_use, RISCV_MAX_COUNTERS); + bitmap_zero(kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS); + bitmap_zero(kvpmu->pmc_overflown, RISCV_KVM_MAX_COUNTERS); memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event)); + kvm_pmu_clear_snapshot_area(vcpu); } void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu) diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 72a2ffb8dcd1..62f409d4176e 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -138,8 +138,11 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, unsigned long i; struct kvm_vcpu *tmp; - kvm_for_each_vcpu(i, tmp, vcpu->kvm) - tmp->arch.power_off = true; + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + spin_lock(&vcpu->arch.mp_state_lock); + WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); + spin_unlock(&vcpu->arch.mp_state_lock); + } kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); memset(&run->system_event, 0, sizeof(run->system_event)); diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index 7dca0e9381d9..dce667f4b6ab 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -18,13 +18,20 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu) struct kvm_cpu_context *cp = &vcpu->arch.guest_context; struct kvm_vcpu *target_vcpu; unsigned long target_vcpuid = cp->a0; + int ret = 0; target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) return SBI_ERR_INVALID_PARAM; - if (!target_vcpu->arch.power_off) - return SBI_ERR_ALREADY_AVAILABLE; + spin_lock(&target_vcpu->arch.mp_state_lock); + + if (!kvm_riscv_vcpu_stopped(target_vcpu)) { + ret = SBI_ERR_ALREADY_AVAILABLE; + goto out; + } + + spin_lock(&target_vcpu->arch.reset_cntx_lock); reset_cntx = &target_vcpu->arch.guest_reset_context; /* start address */ reset_cntx->sepc = cp->a1; @@ -32,21 +39,35 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu) reset_cntx->a0 = target_vcpuid; /* private data passed from kernel */ reset_cntx->a1 = cp->a2; + spin_unlock(&target_vcpu->arch.reset_cntx_lock); + kvm_make_request(KVM_REQ_VCPU_RESET, target_vcpu); - kvm_riscv_vcpu_power_on(target_vcpu); + __kvm_riscv_vcpu_power_on(target_vcpu); - return 0; +out: + spin_unlock(&target_vcpu->arch.mp_state_lock); + + return ret; } static int kvm_sbi_hsm_vcpu_stop(struct kvm_vcpu *vcpu) { - if (vcpu->arch.power_off) - return SBI_ERR_FAILURE; + int ret = 0; - kvm_riscv_vcpu_power_off(vcpu); + spin_lock(&vcpu->arch.mp_state_lock); - return 0; + if (kvm_riscv_vcpu_stopped(vcpu)) { + ret = SBI_ERR_FAILURE; + goto out; + } + + __kvm_riscv_vcpu_power_off(vcpu); + +out: + spin_unlock(&vcpu->arch.mp_state_lock); + + return ret; } static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) @@ -58,7 +79,7 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) return SBI_ERR_INVALID_PARAM; - if (!target_vcpu->arch.power_off) + if (!kvm_riscv_vcpu_stopped(target_vcpu)) return SBI_HSM_STATE_STARTED; else if (vcpu->stat.generic.blocking) return SBI_HSM_STATE_SUSPENDED; @@ -71,14 +92,11 @@ static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, { int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; - struct kvm *kvm = vcpu->kvm; unsigned long funcid = cp->a6; switch (funcid) { case SBI_EXT_HSM_HART_START: - mutex_lock(&kvm->lock); ret = kvm_sbi_hsm_vcpu_start(vcpu); - mutex_unlock(&kvm->lock); break; case SBI_EXT_HSM_HART_STOP: ret = kvm_sbi_hsm_vcpu_stop(vcpu); diff --git a/arch/riscv/kvm/vcpu_sbi_pmu.c b/arch/riscv/kvm/vcpu_sbi_pmu.c index 7eca72df2cbd..e4be34e03e83 100644 --- a/arch/riscv/kvm/vcpu_sbi_pmu.c +++ b/arch/riscv/kvm/vcpu_sbi_pmu.c @@ -42,9 +42,9 @@ static int kvm_sbi_ext_pmu_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, #endif /* * This can fail if perf core framework fails to create an event. - * Forward the error to userspace because it's an error which - * happened within the host kernel. The other option would be - * to convert to an SBI error and forward to the guest. + * No need to forward the error to userspace and exit the guest. + * The operation can continue without profiling. Forward the + * appropriate SBI error to the guest. */ ret = kvm_riscv_vcpu_pmu_ctr_cfg_match(vcpu, cp->a0, cp->a1, cp->a2, cp->a3, temp, retdata); @@ -62,7 +62,16 @@ static int kvm_sbi_ext_pmu_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, ret = kvm_riscv_vcpu_pmu_ctr_stop(vcpu, cp->a0, cp->a1, cp->a2, retdata); break; case SBI_EXT_PMU_COUNTER_FW_READ: - ret = kvm_riscv_vcpu_pmu_ctr_read(vcpu, cp->a0, retdata); + ret = kvm_riscv_vcpu_pmu_fw_ctr_read(vcpu, cp->a0, retdata); + break; + case SBI_EXT_PMU_COUNTER_FW_READ_HI: + if (IS_ENABLED(CONFIG_32BIT)) + ret = kvm_riscv_vcpu_pmu_fw_ctr_read_hi(vcpu, cp->a0, retdata); + else + retdata->out_val = 0; + break; + case SBI_EXT_PMU_SNAPSHOT_SET_SHMEM: + ret = kvm_riscv_vcpu_pmu_snapshot_set_shmem(vcpu, cp->a0, cp->a1, cp->a2, retdata); break; default: retdata->err_val = SBI_ERR_NOT_SUPPORTED; diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index d8cf9ca28c61..5f35427114c1 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -93,8 +93,8 @@ static int kvm_sbi_sta_steal_time_set_shmem(struct kvm_vcpu *vcpu) if (flags != 0) return SBI_ERR_INVALID_PARAM; - if (shmem_phys_lo == SBI_STA_SHMEM_DISABLE && - shmem_phys_hi == SBI_STA_SHMEM_DISABLE) { + if (shmem_phys_lo == SBI_SHMEM_DISABLE && + shmem_phys_hi == SBI_SHMEM_DISABLE) { vcpu->arch.sta.shmem = INVALID_GPA; return 0; } diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index ce58bc48e5b8..7396b8654f45 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -186,6 +186,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_NR_VCPUS: diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index a2be3aefff9f..f86ad3335529 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -143,6 +143,9 @@ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfe extern u64 xstate_get_guest_group_perm(void); +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + + /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 110d7f29ca9a..5187fcf4b610 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -121,6 +121,7 @@ KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) #endif +KVM_X86_OP_OPTIONAL(dev_get_attr) KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6efd1497b026..69862e8ecab3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -254,28 +254,31 @@ enum x86_intercept_stage; KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) +#define PFERR_PRESENT_MASK BIT(0) +#define PFERR_WRITE_MASK BIT(1) +#define PFERR_USER_MASK BIT(2) +#define PFERR_RSVD_MASK BIT(3) +#define PFERR_FETCH_MASK BIT(4) +#define PFERR_PK_MASK BIT(5) +#define PFERR_SGX_MASK BIT(15) +#define PFERR_GUEST_RMP_MASK BIT_ULL(31) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(32) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_GUEST_ENC_MASK BIT_ULL(34) +#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) +#define PFERR_GUEST_VMPL_MASK BIT_ULL(36) -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 -#define PFERR_PK_BIT 5 -#define PFERR_SGX_BIT 15 -#define PFERR_GUEST_FINAL_BIT 32 -#define PFERR_GUEST_PAGE_BIT 33 -#define PFERR_IMPLICIT_ACCESS_BIT 48 - -#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) -#define PFERR_USER_MASK BIT(PFERR_USER_BIT) -#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) -#define PFERR_PK_MASK BIT(PFERR_PK_BIT) -#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) +/* + * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks + * when emulating instructions that triggers implicit access. + */ +#define PFERR_IMPLICIT_ACCESS BIT_ULL(48) +/* + * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred + * when the guest was accessing private memory. + */ +#define PFERR_PRIVATE_ACCESS BIT_ULL(49) +#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ @@ -1280,12 +1283,14 @@ enum kvm_apicv_inhibit { }; struct kvm_arch { - unsigned long vm_type; unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; u8 mmu_valid_gen; + u8 vm_type; + bool has_private_mem; + bool has_protected_state; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; @@ -1312,6 +1317,8 @@ struct kvm_arch { */ spinlock_t mmu_unsync_pages_lock; + u64 shadow_mmio_value; + struct iommu_domain *iommu_domain; bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA @@ -1779,6 +1786,7 @@ struct kvm_x86_ops { void (*enable_smi_window)(struct kvm_vcpu *vcpu); #endif + int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); @@ -1844,6 +1852,7 @@ struct kvm_arch_async_pf { gfn_t gfn; unsigned long cr3; bool direct_map; + u64 error_code; }; extern u32 __read_mostly kvm_nr_uret_msrs; @@ -2140,6 +2149,10 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, kvm_set_or_clear_apicv_inhibit(kvm, reason, false); } +unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, + unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, + int op_64_bit, int cpl); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, @@ -2153,8 +2166,9 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); + #ifdef CONFIG_KVM_PRIVATE_MEM -#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM) +#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) #else #define kvm_arch_has_private_mem(kvm) false #endif diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index b463fcbd4b90..5a8246dd532f 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -54,8 +54,10 @@ (((unsigned long)fn) << 32)) /* AP Reset Hold */ -#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 -#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 +#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 +#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) /* GHCB GPA Register */ #define GHCB_MSR_REG_GPA_REQ 0x012 @@ -99,6 +101,8 @@ enum psc_op { /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_POS 12 +#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0) #define GHCB_MSR_HV_FT_RESP_VAL(v) \ /* GHCBData[63:12] */ \ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 4dba17363008..d77a31039f24 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,6 +71,7 @@ #define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING) #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) +#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE) #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) #define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) @@ -226,6 +227,8 @@ enum vmcs_field { VMREAD_BITMAP_HIGH = 0x00002027, VMWRITE_BITMAP = 0x00002028, VMWRITE_BITMAP_HIGH = 0x00002029, + VE_INFORMATION_ADDRESS = 0x0000202A, + VE_INFORMATION_ADDRESS_HIGH = 0x0000202B, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, ENCLS_EXITING_BITMAP = 0x0000202E, @@ -514,6 +517,7 @@ enum vmcs_field { #define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63) #define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ VMX_EPT_WRITABLE_MASK | \ VMX_EPT_EXECUTABLE_MASK) @@ -630,4 +634,13 @@ enum vmx_l1d_flush_state { extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; +struct vmx_ve_information { + u32 exit_reason; + u32 delivery; + u64 exit_qualification; + u64 guest_linear_address; + u64 guest_physical_address; + u16 eptp_index; +}; + #endif diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ef11aa4cab42..9fae1b73b529 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -457,8 +457,13 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 -/* attributes for system fd (group 0) */ -#define KVM_X86_XCOMP_GUEST_SUPP 0 +/* vendor-independent attributes for system fd (group 0) */ +#define KVM_X86_GRP_SYSTEM 0 +# define KVM_X86_XCOMP_GUEST_SUPP 0 + +/* vendor-specific groups and attributes for system fd */ +#define KVM_X86_GRP_SEV 1 +# define KVM_X86_SEV_VMSA_FEATURES 0 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -689,6 +694,9 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* Second time is the charm; improved versions of the above ioctls. */ + KVM_SEV_INIT2, + KVM_SEV_NR_MAX, }; @@ -700,6 +708,14 @@ struct kvm_sev_cmd { __u32 sev_fd; }; +struct kvm_sev_init { + __u64 vmsa_features; + __u32 flags; + __u16 ghcb_version; + __u16 pad1; + __u32 pad2[8]; +}; + struct kvm_sev_launch_start { __u32 handle; __u32 policy; @@ -856,5 +872,7 @@ struct kvm_hyperv_eventfd { #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +#define KVM_X86_SEV_VM 2 +#define KVM_X86_SEV_ES_VM 3 #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 33a214b1a4ce..6d32e415b01e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -991,6 +991,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } +EXPORT_SYMBOL_GPL(get_xsave_addr); #ifdef CONFIG_ARCH_HAS_PKEYS diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 19ca623ffa2a..05df04f39628 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -54,8 +54,6 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(unsigned int legacy_size); -extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); - static inline u64 xfeatures_mask_supervisor(void) { return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 0ebdd088f28b..d64fb2b3eb69 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -95,6 +95,19 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config KVM_INTEL_PROVE_VE + bool "Check that guests do not receive #VE exceptions" + default KVM_PROVE_MMU || DEBUG_KERNEL + depends on KVM_INTEL + help + + Checks that KVM's page table management code will not incorrectly + let guests receive a virtualization exception. Virtualization + exceptions will be trapped by the hypervisor rather than injected + in the guest. + + If unsure, say N. + config X86_SGX_KVM bool "Software Guard eXtensions (SGX) Virtualization" depends on X86_SGX && KVM_INTEL diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index addc44fc7187..5494669a055a 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,14 +16,15 @@ kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ - vmx/nested.o vmx/posted_intr.o + vmx/nested.o vmx/posted_intr.o vmx/main.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o -kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \ - svm/sev.o -kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o +kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o + +kvm-amd-$(CONFIG_KVM_AMD_SEV) += svm/sev.o +kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o ifdef CONFIG_HYPERV kvm-y += kvm_onhyperv.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 77352a4abd87..1851b3870a9c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -772,7 +772,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0); kvm_cpu_cap_mask(CPUID_8000_001F_EAX, - 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) | + 0 /* SME */ | 0 /* SEV */ | 0 /* VM_PAGE_FLUSH */ | 0 /* SEV_ES */ | F(SME_COHERENT)); kvm_cpu_cap_mask(CPUID_8000_0021_EAX, diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 60f21bb4c27b..2343c9f00e31 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; - int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; + int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1; u32 errcode = PFERR_PRESENT_MASK; bool fault; @@ -234,8 +234,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ - offset = (pfec & ~1) + - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); + offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0); pkru_bits &= mmu->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index db007a4dffa2..c39674d19260 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -432,8 +432,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * - * An spte tlb flush may be pending, because kvm_set_pte_rmap - * coalesces them and we are running out of the MMU lock. Therefore + * An spte tlb flush may be pending, because they are coalesced and + * we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * * Reading the spte while an update is in progress may get the old value @@ -567,9 +567,9 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) - __update_clear_spte_fast(sptep, 0ull); + __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else - old_spte = __update_clear_spte_slow(sptep, 0ull); + old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); if (!is_shadow_present_pte(old_spte)) return old_spte; @@ -603,7 +603,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) */ static void mmu_spte_clear_no_track(u64 *sptep) { - __update_clear_spte_fast(sptep, 0ull); + __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); } static u64 mmu_spte_get_lockless(u64 *sptep) @@ -1448,49 +1448,11 @@ static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, } static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) + struct kvm_memory_slot *slot, gfn_t gfn, int level) { return __kvm_zap_rmap(kvm, rmap_head, slot); } -static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t pte) -{ - u64 *sptep; - struct rmap_iterator iter; - bool need_flush = false; - u64 new_spte; - kvm_pfn_t new_pfn; - - WARN_ON_ONCE(pte_huge(pte)); - new_pfn = pte_pfn(pte); - -restart: - for_each_rmap_spte(rmap_head, &iter, sptep) { - need_flush = true; - - if (pte_write(pte)) { - kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); - goto restart; - } else { - new_spte = kvm_mmu_changed_pte_notifier_make_spte( - *sptep, new_pfn); - - mmu_spte_clear_track_bits(kvm, sptep); - mmu_spte_set(sptep, new_spte); - } - } - - if (need_flush && kvm_available_flush_remote_tlbs_range()) { - kvm_flush_remote_tlbs_gfn(kvm, gfn, level); - return false; - } - - return need_flush; -} - struct slot_rmap_walk_iterator { /* input fields. */ const struct kvm_memory_slot *slot; @@ -1562,7 +1524,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t pte); + int level); static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, @@ -1574,7 +1536,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, range->start, range->end - 1, &iterator) ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, - iterator.level, range->arg.pte); + iterator.level); return ret; } @@ -1596,22 +1558,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - bool flush = false; - - if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); - - if (tdp_mmu_enabled) - flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); - - return flush; -} - static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) + struct kvm_memory_slot *slot, gfn_t gfn, int level) { u64 *sptep; struct rmap_iterator iter; @@ -1624,8 +1572,7 @@ static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, } static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t unused) + struct kvm_memory_slot *slot, gfn_t gfn, int level) { u64 *sptep; struct rmap_iterator iter; @@ -1950,7 +1897,8 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - if (!sp->spt[i]) + /* sp->spt[i] has initial value of shadow page table allocation */ + if (sp->spt[i] == SHADOW_NONPRESENT_VALUE) return 0; return vcpu->arch.mmu->sync_spte(vcpu, sp, i); @@ -2514,7 +2462,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } - } else if (is_mmio_spte(pte)) { + } else if (is_mmio_spte(kvm, pte)) { mmu_spte_clear_no_track(spte); } return 0; @@ -3314,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, { gva_t gva = fault->is_tdp ? 0 : fault->addr; + if (fault->is_private) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); + fault->slot = NULL; + fault->pfn = KVM_PFN_NOSLOT; + fault->map_writable = false; + fault->hva = KVM_HVA_ERR_BAD; + /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an @@ -4196,7 +4154,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (WARN_ON_ONCE(reserved)) return -EINVAL; - if (is_mmio_spte(spte)) { + if (is_mmio_spte(vcpu->kvm, spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned int access = get_mmio_spte_access(spte); @@ -4259,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu) return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); - arch.gfn = gfn; + arch.gfn = fault->gfn; + arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); - return kvm_setup_async_pf(vcpu, cr2_or_gpa, - kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + return kvm_setup_async_pf(vcpu, fault->addr, + kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; + if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) + return; + if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; @@ -4289,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); } static inline u8 kvm_max_level_for_order(int order) @@ -4309,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } -static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) -{ - kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, - PAGE_SIZE, fault->write, fault->exec, - fault->is_private); -} - static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { @@ -4343,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_memory_slot *slot = fault->slot; bool async; - /* - * Retry the page fault if the gfn hit a memslot that is being deleted - * or moved. This ensures any existing SPTEs for the old memslot will - * be zapped before KVM inserts a new MMIO SPTE for the gfn. - */ - if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - return RET_PF_RETRY; - - if (!kvm_is_visible_memslot(slot)) { - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu)) { - fault->slot = NULL; - fault->pfn = KVM_PFN_NOSLOT; - fault->map_writable = false; - return RET_PF_CONTINUE; - } - /* - * If the APIC access page exists but is disabled, go directly - * to emulation without caching the MMIO access or creating a - * MMIO SPTE. That way the cache doesn't need to be purged - * when the AVIC is re-enabled. - */ - if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && - !kvm_apicv_activated(vcpu->kvm)) - return RET_PF_EMULATE; - } - - if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { - kvm_mmu_prepare_memory_fault_exit(vcpu, fault); - return -EFAULT; - } - if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); async = false; - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, - fault->write, &fault->map_writable, - &fault->hva); + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, + &async, fault->write, + &fault->map_writable, &fault->hva); if (!async) return RET_PF_CONTINUE; /* *pfn has correct page already */ @@ -4394,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; - } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { + } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } @@ -4404,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, - fault->write, &fault->map_writable, - &fault->hva); + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, + NULL, fault->write, + &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; } static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { + struct kvm_memory_slot *slot = fault->slot; int ret; + /* + * Note that the mmu_invalidate_seq also serves to detect a concurrent + * change in attributes. is_page_fault_stale() will detect an + * invalidation relate to fault->fn and resume the guest without + * installing a mapping in the page tables. + */ + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + /* + * Now that we have a snapshot of mmu_invalidate_seq we can check for a + * private vs. shared mismatch. + */ + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + if (unlikely(!slot)) + return kvm_handle_noslot_fault(vcpu, fault, access); + + /* + * Retry the page fault if the gfn hit a memslot that is being deleted + * or moved. This ensures any existing SPTEs for the old memslot will + * be zapped before KVM inserts a new MMIO SPTE for the gfn. + */ + if (slot->flags & KVM_MEMSLOT_INVALID) + return RET_PF_RETRY; + + if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { + /* + * Don't map L1's APIC access page into L2, KVM doesn't support + * using APICv/AVIC to accelerate L2 accesses to L1's APIC, + * i.e. the access needs to be emulated. Emulating access to + * L1's APIC is also correct if L1 is accelerating L2's own + * virtual APIC, but for some reason L1 also maps _L1's_ APIC + * into L2. Note, vcpu_is_mmio_gpa() always treats access to + * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM + * uses different roots for L1 vs. L2, i.e. there is no danger + * of breaking APICv/AVIC for L1. + */ + if (is_guest_mode(vcpu)) + return kvm_handle_noslot_fault(vcpu, fault, access); + + /* + * If the APIC access page exists but is disabled, go directly + * to emulation without caching the MMIO access or creating a + * MMIO SPTE. That way the cache doesn't need to be purged + * when the AVIC is re-enabled. + */ + if (!kvm_apicv_activated(vcpu->kvm)) + return RET_PF_EMULATE; + } + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); @@ -4439,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held * to detect retry guarantees the worst case latency for the vCPU. */ - if (fault->slot && - mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) + if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; ret = __kvm_faultin_pfn(vcpu, fault); @@ -4450,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); - if (unlikely(!fault->slot)) + if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access); /* @@ -4561,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif + /* + * Legacy #PF exception only have a 32-bit error code. Simply drop the + * upper bits as KVM doesn't use them for #PF (because they are never + * set), and to ensure there are no collisions with KVM-defined bits. + */ + if (WARN_ON_ONCE(error_code >> 32)) + error_code = lower_32_bits(error_code); + + /* Ensure the above sanity check also covers KVM-defined flags. */ + BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { @@ -4812,7 +4797,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { - if (unlikely(is_mmio_spte(*sptep))) { + if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { mmu_spte_clear_no_track(sptep); return true; @@ -5846,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; - /* - * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP - * checks when emulating instructions that triggers implicit access. - * WARN if hardware generates a fault with an error code that collides - * with the KVM-defined value. Clear the flag and continue on, i.e. - * don't terminate the VM, as KVM can't possibly be relying on a flag - * that KVM doesn't know about. - */ - if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) - error_code &= ~PFERR_IMPLICIT_ACCESS; - if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; + /* + * Except for reserved faults (emulated MMIO is shared-only), set the + * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's + * current attributes, which are the source of truth for such VMs. Note, + * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't + * currently supported nested virtualization (among many other things) + * for software-protected VMs. + */ + if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && + !(error_code & PFERR_RSVD_MASK) && + vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { + if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) + return -EFAULT; + r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { - r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), false, + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; @@ -6173,7 +6163,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; - vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu_shadow_page_cache.init_value = + SHADOW_NONPRESENT_VALUE; + if (!vcpu->arch.mmu_shadow_page_cache.init_value) + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -6316,6 +6309,7 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_init_vm(struct kvm *kvm) { + kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5390a591a571..ce2fcd19ba6b 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm) struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; - const u32 error_code; + const u64 error_code; const bool prefetch; /* Derived from error_code. */ @@ -279,8 +279,16 @@ enum { RET_PF_SPURIOUS, }; +static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, + PAGE_SIZE, fault->write, fault->exec, + fault->is_private); +} + static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, int *emulation_type) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -298,7 +306,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, - .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), + .is_private = err & PFERR_PRIVATE_ACCESS, + + .pfn = KVM_PFN_ERR_FAULT, + .hva = KVM_HVA_ERR_BAD, }; int r; @@ -320,6 +331,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, else r = vcpu->arch.mmu->page_fault(vcpu, &fault); + /* + * Not sure what's happening, but punt to userspace and hope that + * they can fix it by changing memory to shared, or they can + * provide a better error. + */ + if (r == RET_PF_EMULATE && fault.is_private) { + pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); + kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); + return -EFAULT; + } + if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef69..195d98bc8de8 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -260,7 +260,7 @@ TRACE_EVENT( TP_STRUCT__entry( __field(int, vcpu_id) __field(gpa_t, cr2_or_gpa) - __field(u32, error_code) + __field(u64, error_code) __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4d4e98fe4f35..9aac3aa93d88 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -911,7 +911,7 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int gpa_t pte_gpa; gfn_t gfn; - if (WARN_ON_ONCE(!sp->spt[i])) + if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE)) return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); @@ -933,13 +933,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int return 0; /* - * Drop the SPTE if the new protections would result in a RWX=0 - * SPTE or if the gfn is changing. The RWX=0 case only affects - * EPT with execute-only support, i.e. EPT without an effective - * "present" bit, as all other paging modes will create a - * read-only SPTE if pte_access is zero. + * Drop the SPTE if the new protections result in no effective + * "present" bit or if the gfn is changing. The former case + * only affects EPT with execute-only support with pte_access==0; + * all other paging modes will create a read-only SPTE if + * pte_access is zero. */ - if ((!pte_access && !shadow_present_mask) || + if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE || gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); return 1; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 4a599130e9c9..a5e014d7bc62 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -74,10 +74,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!shadow_mmio_value); + WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); access &= shadow_mmio_access_mask; - spte |= shadow_mmio_value | access; + spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; @@ -144,19 +144,19 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; - WARN_ON_ONCE(!pte_access && !shadow_present_mask); + /* + * For the EPT case, shadow_present_mask has no RWX bits set if + * exec-only page table entries are supported. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE); if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; else if (kvm_mmu_page_ad_need_write_protect(sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ spte |= shadow_present_mask; if (!prefetch) spte |= spte_shadow_accessed_mask(spte); @@ -322,22 +322,6 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) return spte; } -u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) -{ - u64 new_spte; - - new_spte = old_spte & ~SPTE_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~shadow_host_writable_mask; - new_spte &= ~shadow_mmu_writable_mask; - - new_spte = mark_spte_for_access_track(new_spte); - - return new_spte; -} - u64 mark_spte_for_access_track(u64 spte) { if (spte_ad_enabled(spte)) @@ -429,7 +413,9 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; - shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ + shadow_present_mask = + (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; /* * EPT overrides the host MTRRs, and so KVM must program the desired * memtype directly into the SPTEs. Note, this mask is just the mask @@ -446,7 +432,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) * of an EPT paging-structure entry is 110b (write/execute). */ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, - VMX_EPT_RWX_MASK, 0); + VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index a129951c9a88..5dd5405fa07a 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -149,6 +149,22 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) +/* + * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress + * #VE and get EPT violations on non-present PTEs. We can use the + * same value also without TDX for both VMX and SVM: + * + * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored. + * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0) + * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1) + */ +#ifdef CONFIG_X86_64 +#define SHADOW_NONPRESENT_VALUE BIT_ULL(63) +static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); +#else +#define SHADOW_NONPRESENT_VALUE 0ULL +#endif + extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; @@ -190,11 +206,11 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF - * vulnerability. Use only low bits to avoid 64-bit immediates. + * vulnerability. * * Only used by the TDP MMU. */ -#define REMOVED_SPTE 0x5a0ULL +#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) /* Removed SPTEs must not be misconstrued as shadow present PTEs. */ static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); @@ -249,9 +265,9 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root) return spte_to_child_sp(root); } -static inline bool is_mmio_spte(u64 spte) +static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_value && + return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && likely(enable_mmio_caching); } @@ -496,8 +512,6 @@ static inline u64 restore_acc_track_spte(u64 spte) return spte; } -u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); - void __init kvm_mmu_spte_module_init(void); void kvm_mmu_reset_all_pte_masks(void); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 04c1f0957fea..b5be7e949bd8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -495,8 +495,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * impact the guest since both the former and current SPTEs * are nonpresent. */ - if (WARN_ON_ONCE(!is_mmio_spte(old_spte) && - !is_mmio_spte(new_spte) && + if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) && + !is_mmio_spte(kvm, new_spte) && !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" @@ -603,7 +603,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * here since the SPTE is going from non-present to non-present. Use * the raw write helper to avoid an unnecessary check on volatile bits. */ - __kvm_tdp_mmu_write_spte(iter->sptep, 0); + __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); return 0; } @@ -740,8 +740,8 @@ retry: continue; if (!shared) - tdp_mmu_iter_set_spte(kvm, &iter, 0); - else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE); + else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) goto retry; } } @@ -808,8 +808,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) return false; - tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, - sp->gfn, sp->role.level + 1); + tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, + SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1); return true; } @@ -843,7 +843,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_iter_set_spte(kvm, &iter, 0); + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE); /* * Zappings SPTEs in invalid roots doesn't require a TLB flush, @@ -1028,7 +1028,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ - if (unlikely(is_mmio_spte(new_spte))) { + if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) { vcpu->stat.pf_mmio_spte_created++; trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, new_spte); @@ -1258,52 +1258,6 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); } -static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) -{ - u64 new_spte; - - /* Huge pages aren't expected to be modified without first being zapped. */ - WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end); - - if (iter->level != PG_LEVEL_4K || - !is_shadow_present_pte(iter->old_spte)) - return false; - - /* - * Note, when changing a read-only SPTE, it's not strictly necessary to - * zero the SPTE before setting the new PFN, but doing so preserves the - * invariant that the PFN of a present * leaf SPTE can never change. - * See handle_changed_spte(). - */ - tdp_mmu_iter_set_spte(kvm, iter, 0); - - if (!pte_write(range->arg.pte)) { - new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, - pte_pfn(range->arg.pte)); - - tdp_mmu_iter_set_spte(kvm, iter, new_spte); - } - - return true; -} - -/* - * Handle the changed_pte MMU notifier for the TDP MMU. - * data is a pointer to the new pte_t mapping the HVA specified by the MMU - * notifier. - * Returns non-zero if a flush is needed before releasing the MMU lock. - */ -bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - /* - * No need to handle the remote TLB flush under RCU protection, the - * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a - * shadow page. See the WARN on pfn_changed in handle_changed_spte(). - */ - return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); -} - /* * Remove write access from all SPTEs at or above min_level that map GFNs * [start, end). Returns true if an SPTE has been changed and the TLBs need to diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 6e1ea04ca885..58b55e61bd33 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -31,7 +31,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 759581bb2128..0623cfaa7bb0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -23,6 +23,7 @@ #include <asm/pkru.h> #include <asm/trapnr.h> #include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include "mmu.h" @@ -32,22 +33,12 @@ #include "cpuid.h" #include "trace.h" -#ifndef CONFIG_KVM_AMD_SEV -/* - * When this config is not defined, SEV feature is not supported and APIs in - * this file are not used but this file still gets compiled into the KVM AMD - * module. - * - * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum - * misc_res_type {} defined in linux/misc_cgroup.h. - * - * Below macros allow compilation to succeed. - */ -#define MISC_CG_RES_SEV MISC_CG_RES_TYPES -#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES -#endif +#define GHCB_VERSION_MAX 2ULL +#define GHCB_VERSION_DEFAULT 2ULL +#define GHCB_VERSION_MIN 1ULL + +#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP -#ifdef CONFIG_KVM_AMD_SEV /* enable/disable SEV support */ static bool sev_enabled = true; module_param_named(sev, sev_enabled, bool, 0444); @@ -57,13 +48,13 @@ static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); /* enable/disable SEV-ES DebugSwap support */ -static bool sev_es_debug_swap_enabled = false; +static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); -#else -#define sev_enabled false -#define sev_es_enabled false -#define sev_es_debug_swap_enabled false -#endif /* CONFIG_KVM_AMD_SEV */ +static u64 sev_supported_vmsa_features; + +#define AP_RESET_HOLD_NONE 0 +#define AP_RESET_HOLD_NAE_EVENT 1 +#define AP_RESET_HOLD_MSR_PROTO 2 static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); @@ -113,7 +104,15 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) static inline bool is_mirroring_enc_context(struct kvm *kvm) { - return !!to_kvm_svm(kvm)->sev_info.enc_context_owner; + return !!to_kvm_sev_info(kvm)->enc_context_owner; +} + +static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + + return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } /* Must be called with the sev_bitmap_lock held */ @@ -251,20 +250,44 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_decommission(handle); } -static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, + struct kvm_sev_init *data, + unsigned long vm_type) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_platform_init_args init_args = {0}; + bool es_active = vm_type != KVM_X86_SEV_VM; + u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; int ret; if (kvm->created_vcpus) return -EINVAL; + if (data->flags) + return -EINVAL; + + if (data->vmsa_features & ~valid_vmsa_features) + return -EINVAL; + + if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) + return -EINVAL; + if (unlikely(sev->active)) return -EINVAL; sev->active = true; - sev->es_active = argp->id == KVM_SEV_ES_INIT; + sev->es_active = es_active; + sev->vmsa_features = data->vmsa_features; + sev->ghcb_version = data->ghcb_version; + + /* + * Currently KVM supports the full range of mandatory features defined + * by version 2 of the GHCB protocol, so default to that for SEV-ES + * guests created via KVM_SEV_INIT2. + */ + if (sev->es_active && !sev->ghcb_version) + sev->ghcb_version = GHCB_VERSION_DEFAULT; + ret = sev_asid_new(sev); if (ret) goto e_no_asid; @@ -276,6 +299,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); + sev->need_init = false; kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); @@ -286,11 +310,53 @@ e_free: sev_asid_free(sev); sev->asid = 0; e_no_asid: + sev->vmsa_features = 0; sev->es_active = false; sev->active = false; return ret; } +static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_init data = { + .vmsa_features = 0, + .ghcb_version = 0, + }; + unsigned long vm_type; + + if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM) + return -EINVAL; + + vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); + + /* + * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will + * continue to only ever support the minimal GHCB protocol version. + */ + if (vm_type == KVM_X86_SEV_ES_VM) + data.ghcb_version = GHCB_VERSION_MIN; + + return __sev_guest_init(kvm, argp, &data, vm_type); +} + +static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_init data; + + if (!sev->need_init) + return -EINVAL; + + if (kvm->arch.vm_type != KVM_X86_SEV_VM && + kvm->arch.vm_type != KVM_X86_SEV_ES_VM) + return -EINVAL; + + if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) + return -EFAULT; + + return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type); +} + static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { unsigned int asid = sev_get_asid(kvm); @@ -339,7 +405,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; memset(&start, 0, sizeof(start)); @@ -383,7 +449,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* return handle to userspace */ params.handle = start.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) { + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) { sev_unbind_asid(kvm, start.handle); ret = -EFAULT; goto e_free_session; @@ -522,7 +588,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; vaddr = params.uaddr; @@ -580,7 +646,13 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; struct sev_es_save_area *save = svm->sev_es.vmsa; + struct xregs_state *xsave; + const u8 *s; + u8 *d; + int i; /* Check some debug related fields before encrypting the VMSA */ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) @@ -621,10 +693,44 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - if (sev_es_debug_swap_enabled) { - save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; - pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. " - "This will not work starting with Linux 6.10\n"); + save->sev_features = sev->vmsa_features; + + /* + * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid + * breaking older measurements. + */ + if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) { + xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave; + save->x87_dp = xsave->i387.rdp; + save->mxcsr = xsave->i387.mxcsr; + save->x87_ftw = xsave->i387.twd; + save->x87_fsw = xsave->i387.swd; + save->x87_fcw = xsave->i387.cwd; + save->x87_fop = xsave->i387.fop; + save->x87_ds = 0; + save->x87_cs = 0; + save->x87_rip = xsave->i387.rip; + + for (i = 0; i < 8; i++) { + /* + * The format of the x87 save area is undocumented and + * definitely not what you would expect. It consists of + * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes + * area with bytes 8-9 of each register. + */ + d = save->fpreg_x87 + i * 8; + s = ((u8 *)xsave->i387.st_space) + i * 16; + memcpy(d, s, 8); + save->fpreg_x87[64 + i * 2] = s[8]; + save->fpreg_x87[64 + i * 2 + 1] = s[9]; + } + memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256); + + s = get_xsave_addr(xsave, XFEATURE_YMM); + if (s) + memcpy(save->fpreg_ymm, s, 256); + else + memset(save->fpreg_ymm, 0, 256); } pr_debug("Virtual Machine Save Area (VMSA):\n"); @@ -658,13 +764,20 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); vmsa.reserved = 0; - vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; + vmsa.handle = to_kvm_sev_info(kvm)->handle; vmsa.address = __sme_pa(svm->sev_es.vmsa); vmsa.len = PAGE_SIZE; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); if (ret) return ret; + /* + * SEV-ES guests maintain an encrypted version of their FPU + * state which is restored and saved on VMRUN and VMEXIT. + * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't + * do xsave/xrstor on it. + */ + fpstate_set_confidential(&vcpu->arch.guest_fpu); vcpu->arch.guest_state_protected = true; return 0; } @@ -695,7 +808,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { - void __user *measure = (void __user *)(uintptr_t)argp->data; + void __user *measure = u64_to_user_ptr(argp->data); struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_measure data; struct kvm_sev_launch_measure params; @@ -715,7 +828,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - p = (void __user *)(uintptr_t)params.uaddr; + p = u64_to_user_ptr(params.uaddr); if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; @@ -788,7 +901,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) params.state = data.state; params.handle = data.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) ret = -EFAULT; return ret; @@ -953,7 +1066,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug))) + if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) return -EFAULT; if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) @@ -1037,7 +1150,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); @@ -1101,7 +1214,7 @@ e_unpin_memory: static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) { - void __user *report = (void __user *)(uintptr_t)argp->data; + void __user *report = u64_to_user_ptr(argp->data); struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_attestation_report data; struct kvm_sev_attestation_report params; @@ -1112,7 +1225,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; memset(&data, 0, sizeof(data)); @@ -1121,7 +1234,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - p = (void __user *)(uintptr_t)params.uaddr; + p = u64_to_user_ptr(params.uaddr); if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; @@ -1174,7 +1287,7 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); params->session_len = data.session_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + if (copy_to_user(u64_to_user_ptr(argp->data), params, sizeof(struct kvm_sev_send_start))) ret = -EFAULT; @@ -1193,7 +1306,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_start))) return -EFAULT; @@ -1248,7 +1361,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr, + if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr), session_data, params.session_len)) { ret = -EFAULT; goto e_free_amd_cert; @@ -1256,7 +1369,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) params.policy = data.policy; params.session_len = data.session_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(struct kvm_sev_send_start))) ret = -EFAULT; @@ -1287,7 +1400,7 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + if (copy_to_user(u64_to_user_ptr(argp->data), params, sizeof(struct kvm_sev_send_update_data))) ret = -EFAULT; @@ -1307,7 +1420,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_update_data))) return -EFAULT; @@ -1358,14 +1471,14 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_trans_data; /* copy transport buffer to user space */ - if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr, + if (copy_to_user(u64_to_user_ptr(params.trans_uaddr), trans_data, params.trans_len)) { ret = -EFAULT; goto e_free_trans_data; } /* Copy packet header to userspace. */ - if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, + if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr, params.hdr_len)) ret = -EFAULT; @@ -1417,7 +1530,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return -ENOTTY; /* Get parameter from the userspace */ - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_receive_start))) return -EFAULT; @@ -1459,7 +1572,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) } params.handle = start.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(struct kvm_sev_receive_start))) { ret = -EFAULT; sev_unbind_asid(kvm, start.handle); @@ -1490,7 +1603,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -EINVAL; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_receive_update_data))) return -EFAULT; @@ -1705,6 +1818,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) dst->pages_locked = src->pages_locked; dst->enc_context_owner = src->enc_context_owner; dst->es_active = src->es_active; + dst->vmsa_features = src->vmsa_features; src->asid = 0; src->active = false; @@ -1812,7 +1926,8 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) if (ret) goto out_fput; - if (sev_guest(kvm) || !sev_guest(source_kvm)) { + if (kvm->arch.vm_type != source_kvm->arch.vm_type || + sev_guest(kvm) || !sev_guest(source_kvm)) { ret = -EINVAL; goto out_unlock; } @@ -1861,6 +1976,21 @@ out_fput: return ret; } +int sev_dev_get_attr(u32 group, u64 attr, u64 *val) +{ + if (group != KVM_X86_GRP_SEV) + return -ENXIO; + + switch (attr) { + case KVM_X86_SEV_VMSA_FEATURES: + *val = sev_supported_vmsa_features; + return 0; + + default: + return -ENXIO; + } +} + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -1894,6 +2024,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; + case KVM_SEV_INIT2: + r = sev_guest_init2(kvm, &sev_cmd); + break; case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; @@ -2121,6 +2254,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->asid = source_sev->asid; mirror_sev->fd = source_sev->fd; mirror_sev->es_active = source_sev->es_active; + mirror_sev->need_init = false; mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); INIT_LIST_HEAD(&mirror_sev->mirror_vms); @@ -2186,15 +2320,18 @@ void sev_vm_destroy(struct kvm *kvm) void __init sev_set_cpu_caps(void) { - if (!sev_enabled) - kvm_cpu_cap_clear(X86_FEATURE_SEV); - if (!sev_es_enabled) - kvm_cpu_cap_clear(X86_FEATURE_SEV_ES); + if (sev_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM); + } + if (sev_es_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_ES); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); + } } void __init sev_hardware_setup(void) { -#ifdef CONFIG_KVM_AMD_SEV unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; bool sev_es_supported = false; bool sev_supported = false; @@ -2294,7 +2431,10 @@ out: if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) sev_es_debug_swap_enabled = false; -#endif + + sev_supported_vmsa_features = 0; + if (sev_es_debug_swap_enabled) + sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; } void sev_hardware_unsetup(void) @@ -2585,6 +2725,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: + case SVM_VMGEXIT_HV_FEATURES: + case SVM_VMGEXIT_TERM_REQUEST: break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -2615,6 +2757,9 @@ vmgexit_err: void sev_es_unmap_ghcb(struct vcpu_svm *svm) { + /* Clear any indication that the vCPU is in a type of AP Reset Hold */ + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; + if (!svm->sev_es.ghcb) return; @@ -2774,6 +2919,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; u64 ghcb_info; int ret = 1; @@ -2784,7 +2930,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) switch (ghcb_info) { case GHCB_MSR_SEV_INFO_REQ: - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); break; @@ -2826,6 +2972,28 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_AP_RESET_HOLD_REQ: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + + /* + * Preset the result to a non-SIPI return and then only set + * the result to non-zero when delivering a SIPI. + */ + set_ghcb_msr_bits(svm, 0, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_HV_FT_REQ: + set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, + GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, + GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -2925,6 +3093,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { @@ -2949,6 +3118,19 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; } + case SVM_VMGEXIT_HV_FEATURES: + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED); + + ret = 1; + break; + case SVM_VMGEXIT_TERM_REQUEST: + pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", + control->exit_info_1, control->exit_info_2); + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", @@ -3063,7 +3245,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR8_WRITE); vmcb->control.intercepts[INTERCEPT_DR] = 0; - if (!sev_es_debug_swap_enabled) { + if (!sev_vcpu_has_debug_swap(svm)) { vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); @@ -3109,16 +3291,19 @@ void sev_init_vmcb(struct vcpu_svm *svm) void sev_es_vcpu_reset(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + /* * Set the GHCB MSR value as per the GHCB specification when emulating * vCPU RESET for an SEV-ES guest. */ - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); } -void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) +void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { /* * All host state for SEV-ES guests is categorized into three swap types @@ -3146,7 +3331,7 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both * saves and loads debug registers (Type-A). */ - if (sev_es_debug_swap_enabled) { + if (sev_vcpu_has_debug_swap(svm)) { hostsa->dr0 = native_get_debugreg(0); hostsa->dr1 = native_get_debugreg(1); hostsa->dr2 = native_get_debugreg(2); @@ -3168,15 +3353,31 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) return; } - /* - * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where - * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a - * non-zero value. - */ - if (!svm->sev_es.ghcb) - return; + /* Subsequent SIPI */ + switch (svm->sev_es.ap_reset_hold_type) { + case AP_RESET_HOLD_NAE_EVENT: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. + */ + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + break; + case AP_RESET_HOLD_MSR_PROTO: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set GHCB data field to a non-zero value. + */ + set_ghcb_msr_bits(svm, 1, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + default: + break; + } } struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9aaf83c8d57d..c8dc25886c16 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1433,14 +1433,6 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) vmsa_page = snp_safe_alloc_page(vcpu); if (!vmsa_page) goto error_free_vmcb_page; - - /* - * SEV-ES guests maintain an encrypted version of their FPU - * state which is restored and saved on VMRUN and VMEXIT. - * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't - * do xsave/xrstor on it. - */ - fpstate_set_confidential(&vcpu->arch.guest_fpu); } err = avic_init_vcpu(svm); @@ -1525,7 +1517,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ vmsave(sd->save_area_pa); if (sev_es_guest(vcpu->kvm)) - sev_es_prepare_switch_to_guest(sev_es_host_save_area(sd)); + sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd)); if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -2056,6 +2048,15 @@ static int npf_interception(struct kvm_vcpu *vcpu) u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; + /* + * WARN if hardware generates a fault with an error code that collides + * with KVM-defined sythentic flags. Clear the flags and continue on, + * i.e. don't terminate the VM, as KVM can't possibly be relying on a + * flag that KVM doesn't know about. + */ + if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) + error_code &= ~PFERR_SYNTHETIC_MASK; + trace_kvm_page_fault(vcpu, fault_address, error_code); return kvm_mmu_page_fault(vcpu, fault_address, error_code, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? @@ -3304,7 +3305,9 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, +#ifdef CONFIG_KVM_AMD_SEV [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, +#endif }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -4085,6 +4088,9 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) { + if (to_kvm_sev_info(vcpu->kvm)->need_init) + return -EINVAL; + return 1; } @@ -4892,6 +4898,14 @@ static void svm_vm_destroy(struct kvm *kvm) static int svm_vm_init(struct kvm *kvm) { + int type = kvm->arch.vm_type; + + if (type != KVM_X86_DEFAULT_VM && + type != KVM_X86_SW_PROTECTED_VM) { + kvm->arch.has_protected_state = (type == KVM_X86_SEV_ES_VM); + to_kvm_sev_info(kvm)->need_init = true; + } + if (!pause_filter_count || !pause_filter_thresh) kvm->arch.pause_in_guest = true; @@ -5026,6 +5040,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_smi_window = svm_enable_smi_window, #endif +#ifdef CONFIG_KVM_AMD_SEV + .dev_get_attr = sev_dev_get_attr, .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, .mem_enc_unregister_region = sev_mem_enc_unregister_region, @@ -5033,7 +5049,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, .vm_move_enc_context_from = sev_vm_move_enc_context_from, - +#endif .check_emulate_instruction = svm_check_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 33878efdebc8..be57213cd295 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -79,12 +79,15 @@ enum { struct kvm_sev_info { bool active; /* SEV enabled guest */ bool es_active; /* SEV-ES enabled guest */ + bool need_init; /* waiting for SEV_INIT2 */ unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ + u64 vmsa_features; + u16 ghcb_version; /* Highest guest GHCB protocol version allowed */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ struct list_head mirror_vms; /* List of VMs mirroring */ struct list_head mirror_entry; /* Use as a list entry of mirrors */ @@ -197,6 +200,7 @@ struct vcpu_sev_es_state { u8 valid_bitmap[16]; struct kvm_host_map ghcb_map; bool received_first_sipi; + unsigned int ap_reset_hold_type; /* SEV-ES scratch area support */ u64 sw_scratch; @@ -318,6 +322,11 @@ static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) return container_of(kvm, struct kvm_svm, kvm); } +static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm) +{ + return &to_kvm_svm(kvm)->sev_info; +} + static __always_inline bool sev_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV @@ -664,13 +673,16 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ -#define GHCB_VERSION_MAX 1ULL -#define GHCB_VERSION_MIN 1ULL - - -extern unsigned int max_sev_asid; +void pre_sev_run(struct vcpu_svm *svm, int cpu); +void sev_init_vmcb(struct vcpu_svm *svm); +void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); +int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); +void sev_es_vcpu_reset(struct vcpu_svm *svm); +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); +void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); +void sev_es_unmap_ghcb(struct vcpu_svm *svm); -void sev_vm_destroy(struct kvm *kvm); +#ifdef CONFIG_KVM_AMD_SEV int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp); int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range); @@ -679,22 +691,32 @@ int sev_mem_enc_unregister_region(struct kvm *kvm, int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd); int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd); void sev_guest_memory_reclaimed(struct kvm *kvm); +int sev_handle_vmgexit(struct kvm_vcpu *vcpu); -void pre_sev_run(struct vcpu_svm *svm, int cpu); +/* These symbols are used in common code and are stubbed below. */ +struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); +void sev_free_vcpu(struct kvm_vcpu *vcpu); +void sev_vm_destroy(struct kvm *kvm); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); -void sev_init_vmcb(struct vcpu_svm *svm); -void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); -void sev_free_vcpu(struct kvm_vcpu *vcpu); -int sev_handle_vmgexit(struct kvm_vcpu *vcpu); -int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); -void sev_es_vcpu_reset(struct vcpu_svm *svm); -void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); -void sev_es_unmap_ghcb(struct vcpu_svm *svm); -struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); +int sev_dev_get_attr(u32 group, u64 attr, u64 *val); +extern unsigned int max_sev_asid; +#else +static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) { + return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); +} + +static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {} +static inline void sev_vm_destroy(struct kvm *kvm) {} +static inline void __init sev_set_cpu_caps(void) {} +static inline void __init sev_hardware_setup(void) {} +static inline void sev_hardware_unsetup(void) {} +static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } +static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } +#define max_sev_asid 0 +#endif /* vmenter.S */ diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c new file mode 100644 index 000000000000..7c546ad3e4c9 --- /dev/null +++ b/arch/x86/kvm/vmx/main.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/moduleparam.h> + +#include "x86_ops.h" +#include "vmx.h" +#include "nested.h" +#include "pmu.h" + +#define VMX_REQUIRED_APICV_INHIBITS \ + (BIT(APICV_INHIBIT_REASON_DISABLE)| \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_HYPERV) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)) + +struct kvm_x86_ops vt_x86_ops __initdata = { + .name = KBUILD_MODNAME, + + .check_processor_compatibility = vmx_check_processor_compat, + + .hardware_unsetup = vmx_hardware_unsetup, + + .hardware_enable = vmx_hardware_enable, + .hardware_disable = vmx_hardware_disable, + .has_emulated_msr = vmx_has_emulated_msr, + + .vm_size = sizeof(struct kvm_vmx), + .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + + .vcpu_precreate = vmx_vcpu_precreate, + .vcpu_create = vmx_vcpu_create, + .vcpu_free = vmx_vcpu_free, + .vcpu_reset = vmx_vcpu_reset, + + .prepare_switch_to_guest = vmx_prepare_switch_to_guest, + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .update_exception_bitmap = vmx_update_exception_bitmap, + .get_msr_feature = vmx_get_msr_feature, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .is_valid_cr0 = vmx_is_valid_cr0, + .set_cr0 = vmx_set_cr0, + .is_valid_cr4 = vmx_is_valid_cr4, + .set_cr4 = vmx_set_cr4, + .set_efer = vmx_set_efer, + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + .cache_reg = vmx_cache_reg, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + .get_if_flag = vmx_get_if_flag, + + .flush_tlb_all = vmx_flush_tlb_all, + .flush_tlb_current = vmx_flush_tlb_current, + .flush_tlb_gva = vmx_flush_tlb_gva, + .flush_tlb_guest = vmx_flush_tlb_guest, + + .vcpu_pre_run = vmx_vcpu_pre_run, + .vcpu_run = vmx_vcpu_run, + .handle_exit = vmx_handle_exit, + .skip_emulated_instruction = vmx_skip_emulated_instruction, + .update_emulated_instruction = vmx_update_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, + .patch_hypercall = vmx_patch_hypercall, + .inject_irq = vmx_inject_irq, + .inject_nmi = vmx_inject_nmi, + .inject_exception = vmx_inject_exception, + .cancel_injection = vmx_cancel_injection, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, + .enable_nmi_window = vmx_enable_nmi_window, + .enable_irq_window = vmx_enable_irq_window, + .update_cr8_intercept = vmx_update_cr8_intercept, + .set_virtual_apic_mode = vmx_set_virtual_apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_pre_state_restore = vmx_apicv_pre_state_restore, + .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_interrupt = vmx_deliver_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, + + .set_tss_addr = vmx_set_tss_addr, + .set_identity_map_addr = vmx_set_identity_map_addr, + .get_mt_mask = vmx_get_mt_mask, + + .get_exit_info = vmx_get_exit_info, + + .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .get_l2_tsc_offset = vmx_get_l2_tsc_offset, + .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, + .write_tsc_offset = vmx_write_tsc_offset, + .write_tsc_multiplier = vmx_write_tsc_multiplier, + + .load_mmu_pgd = vmx_load_mmu_pgd, + + .check_intercept = vmx_check_intercept, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + + .sched_in = vmx_sched_in, + + .cpu_dirty_log_size = PML_ENTITY_NUM, + .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, + + .nested_ops = &vmx_nested_ops, + + .pi_update_irte = vmx_pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif + + .setup_mce = vmx_setup_mce, + +#ifdef CONFIG_KVM_SMM + .smi_allowed = vmx_smi_allowed, + .enter_smm = vmx_enter_smm, + .leave_smm = vmx_leave_smm, + .enable_smi_window = vmx_enable_smi_window, +#endif + + .check_emulate_instruction = vmx_check_emulate_instruction, + .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .migrate_timers = vmx_migrate_timers, + + .msr_filter_changed = vmx_msr_filter_changed, + .complete_emulated_msr = kvm_complete_insn_gp, + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, + + .get_untagged_addr = vmx_get_untagged_addr, +}; + +struct kvm_x86_init_ops vt_init_ops __initdata = { + .hardware_setup = vmx_hardware_setup, + .handle_intel_pt_intr = NULL, + + .runtime_ops = &vt_x86_ops, + .pmu_ops = &intel_pmu_ops, +}; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7c1996b433e2..b25625314658 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info) return is_exception_n(intr_info, NM_VECTOR); } +static inline bool is_ve_fault(u32 intr_info) +{ + return is_exception_n(intr_info, VE_VECTOR); +} + /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 22411f4aff53..ad36e5eb6667 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -68,6 +68,7 @@ #include "vmcs12.h" #include "vmx.h" #include "x86.h" +#include "x86_ops.h" #include "smm.h" #include "vmx_onhyperv.h" @@ -530,8 +531,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) -static struct kvm_x86_ops vmx_x86_ops __initdata; - static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -581,9 +580,8 @@ static __init void hv_init_evmcs(void) } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_l2_tlb_flush + vt_x86_ops.enable_l2_tlb_flush = hv_enable_l2_tlb_flush; - } else { enlightened_vmcs = false; } @@ -874,6 +872,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); /* + * #VE isn't used for VMX. To test against unexpected changes + * related to #VE for VMX, intercept unexpected #VE and warn on it. + */ + if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) + eb |= 1u << VE_VECTOR; + /* * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway @@ -1477,7 +1481,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1488,7 +1492,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx->host_debugctlmsr = get_debugctlmsr(); } -static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -1547,7 +1551,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmx->emulation_required = vmx_emulation_required(vcpu); } -static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +bool vmx_get_if_flag(struct kvm_vcpu *vcpu) { return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; } @@ -1653,8 +1657,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does @@ -1738,7 +1742,7 @@ rip_updated: * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. */ -static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) +void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1769,7 +1773,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) } } -static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) +int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) { vmx_update_emulated_instruction(vcpu); return skip_emulated_instruction(vcpu); @@ -1788,7 +1792,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } -static void vmx_inject_exception(struct kvm_vcpu *vcpu) +void vmx_inject_exception(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; @@ -1909,12 +1913,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_caps.default_tsc_scaling_ratio; } -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) +void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); } -static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) +void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) { vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); } @@ -1957,7 +1961,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } -static int vmx_get_msr_feature(struct kvm_msr_entry *msr) +int vmx_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: @@ -1974,7 +1978,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; @@ -2155,7 +2159,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; @@ -2458,7 +2462,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return ret; } -static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { unsigned long guest_owned_bits; @@ -2606,6 +2610,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_cpu_based_2nd_exec_control)) return -EIO; } + if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; + #ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) @@ -2630,6 +2637,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, return -EIO; vmx_cap->ept = 0; + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && vmx_cap->vpid) { @@ -2759,7 +2767,7 @@ static bool kvm_is_vmx_supported(void) return supported; } -static int vmx_check_processor_compat(void) +int vmx_check_processor_compat(void) { int cpu = raw_smp_processor_id(); struct vmcs_config vmcs_conf; @@ -2801,7 +2809,7 @@ fault: return -EFAULT; } -static int vmx_hardware_enable(void) +int vmx_hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2841,7 +2849,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -static void vmx_hardware_disable(void) +void vmx_hardware_disable(void) { vmclear_local_loaded_vmcss(); @@ -3155,7 +3163,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3185,7 +3193,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->vpid; } -static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u64 root_hpa = mmu->root.hpa; @@ -3201,7 +3209,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) vpid_sync_context(vmx_get_current_vpid(vcpu)); } -static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in @@ -3210,7 +3218,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } -static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a @@ -3255,7 +3263,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ CPU_BASED_CR3_STORE_EXITING) -static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (is_guest_mode(vcpu)) return nested_guest_cr0_valid(vcpu, cr0); @@ -3376,8 +3384,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) return eptp; } -static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, - int root_level) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3406,8 +3413,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } - -static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be @@ -3523,7 +3529,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->g = (ar >> 15) & 1; } -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment s; @@ -3600,14 +3606,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); } -static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } -static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); @@ -3615,25 +3621,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) *l = (ar >> 13) & 1; } -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_IDTR_LIMIT); dt->address = vmcs_readl(GUEST_IDTR_BASE); } -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_IDTR_LIMIT, dt->size); vmcs_writel(GUEST_IDTR_BASE, dt->address); } -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_GDTR_LIMIT); dt->address = vmcs_readl(GUEST_GDTR_BASE); } -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_GDTR_LIMIT, dt->size); vmcs_writel(GUEST_GDTR_BASE, dt->address); @@ -4101,7 +4107,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); void *vapic_page; @@ -4121,7 +4127,7 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) return ((rvi & 0xf0) > (vppr & 0xf0)); } -static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 i; @@ -4265,8 +4271,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) return 0; } -static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, - int trig_mode, int vector) +void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) { struct kvm_vcpu *vcpu = apic->vcpu; @@ -4428,7 +4434,7 @@ static u32 vmx_vmexit_ctrl(void) ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } -static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4594,6 +4600,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; enable_unrestricted_guest = 0; } if (!enable_unrestricted_guest) @@ -4692,7 +4699,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) return 0; } -static int vmx_vcpu_precreate(struct kvm *kvm) +int vmx_vcpu_precreate(struct kvm *kvm) { return vmx_alloc_ipiv_pid_table(kvm); } @@ -4717,8 +4724,12 @@ static void init_vmcs(struct vcpu_vmx *vmx) exec_controls_set(vmx, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) + if (cpu_has_secondary_exec_ctrls()) { secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (vmx->ve_info) + vmcs_write64(VE_INFORMATION_ADDRESS, + __pa(vmx->ve_info)); + } if (cpu_has_tertiary_exec_ctrls()) tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); @@ -4847,7 +4858,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->pi_desc.sn = 1; } -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4906,12 +4917,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_update_fb_clear_dis(vcpu, vmx); } -static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) +void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } -static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) +void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { @@ -4922,7 +4933,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; @@ -4950,7 +4961,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) vmx_clear_hlt(vcpu); } -static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5028,7 +5039,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_NMI)); } -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; @@ -5050,7 +5061,7 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; @@ -5065,7 +5076,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !vmx_interrupt_blocked(vcpu); } -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) +int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { void __user *ret; @@ -5085,7 +5096,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return init_rmode_tss(kvm, ret); } -static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; return 0; @@ -5206,6 +5217,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); + if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm)) + return -EIO; + error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); @@ -5371,8 +5385,7 @@ static int handle_io(struct kvm_vcpu *vcpu) return kvm_fast_pio(vcpu, size, port, in); } -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { /* * Patch in the VMCALL instruction: @@ -5578,7 +5591,7 @@ out: return kvm_complete_insn_gp(vcpu, err); } -static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); @@ -5597,7 +5610,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); } @@ -5868,7 +5881,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } -static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) { if (vmx_emulation_required_with_pending_exception(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); @@ -6156,9 +6169,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, - u64 *info1, u64 *info2, - u32 *intr_info, u32 *error_code) +void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6416,6 +6428,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) pr_err("Virtual processor ID = 0x%04x\n", vmcs_read16(VIRTUAL_PROCESSOR_ID)); + if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { + struct vmx_ve_information *ve_info = vmx->ve_info; + u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); + + /* + * If KVM is dumping the VMCS, then something has gone wrong + * already. Derefencing an address from the VMCS, which could + * very well be corrupted, is a terrible idea. The virtual + * address is known so use it. + */ + pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, + ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); + pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", + ve_info->exit_reason, ve_info->delivery, + ve_info->exit_qualification, + ve_info->guest_linear_address, + ve_info->guest_physical_address, ve_info->eptp_index); + } } /* @@ -6601,7 +6631,7 @@ unexpected_vmexit: return 0; } -static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { int ret = __vmx_handle_exit(vcpu, exit_fastpath); @@ -6689,7 +6719,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) : "eax", "ebx", "ecx", "edx"); } -static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold; @@ -6759,7 +6789,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap_x2apic(vcpu); } -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) +void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -6828,7 +6858,7 @@ out: kvm_release_pfn_clean(pfn); } -static void vmx_hwapic_isr_update(int max_isr) +void vmx_hwapic_isr_update(int max_isr) { u16 status; u8 old; @@ -6862,7 +6892,7 @@ static void vmx_set_rvi(int vector) } } -static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { /* * When running L2, updating RVI is only relevant when @@ -6876,7 +6906,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) vmx_set_rvi(max_irr); } -static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -6922,7 +6952,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) return; @@ -6933,7 +6963,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) +void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6964,24 +6994,22 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } -static void handle_exception_irqoff(struct vcpu_vmx *vmx) +static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) { - u32 intr_info = vmx_get_intr_info(&vmx->vcpu); - /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) - vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* if exit due to NM, handle before interrupts are enabled */ else if (is_nm_fault(intr_info)) - handle_nm_fault_irqoff(&vmx->vcpu); + handle_nm_fault_irqoff(vcpu); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); } -static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) +static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, + u32 intr_info) { - u32 intr_info = vmx_get_intr_info(vcpu); unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, @@ -6998,7 +7026,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) vcpu->arch.at_instruction_boundary = true; } -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7006,16 +7034,16 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) return; if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) - handle_external_interrupt_irqoff(vcpu); + handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) - handle_exception_irqoff(vmx); + handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. */ -static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) +bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: @@ -7138,7 +7166,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) IDT_VECTORING_ERROR_CODE); } -static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +void vmx_cancel_injection(struct kvm_vcpu *vcpu) { __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), @@ -7308,7 +7336,7 @@ out: guest_state_exit_irqoff(); } -static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7463,7 +7491,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); } -static void vmx_vcpu_free(struct kvm_vcpu *vcpu) +void vmx_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7472,9 +7500,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu) free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); + free_page((unsigned long)vmx->ve_info); } -static int vmx_vcpu_create(struct kvm_vcpu *vcpu) +int vmx_vcpu_create(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; @@ -7565,6 +7594,20 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + err = -ENOMEM; + if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { + struct page *page; + + BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); + + /* ve_info must be page aligned. */ + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!page) + goto free_vmcs; + + vmx->ve_info = page_to_virt(page); + } + if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); @@ -7583,7 +7626,7 @@ free_vpid: #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" -static int vmx_vm_init(struct kvm *kvm) +int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -7614,7 +7657,7 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs. @@ -7786,7 +7829,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } -static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8001,10 +8044,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; } -static int vmx_check_intercept(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info, - enum x86_intercept_stage stage, - struct x86_exception *exception) +int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage, + struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -8084,8 +8127,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, return 0; } -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, - bool *expired) +int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; @@ -8124,13 +8167,13 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, return 0; } -static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif -static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) +void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { if (!kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); @@ -8159,7 +8202,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static void vmx_setup_mce(struct kvm_vcpu *vcpu) +void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -8170,7 +8213,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_SMM -static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) @@ -8178,7 +8221,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !is_smm(vcpu); } -static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8199,7 +8242,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) return 0; } -static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -8220,18 +8263,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) return 0; } -static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) +void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } #endif -static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); } -static void vmx_migrate_timers(struct kvm_vcpu *vcpu) +void vmx_migrate_timers(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; @@ -8241,7 +8284,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) } } -static void vmx_hardware_unsetup(void) +void vmx_hardware_unsetup(void) { kvm_set_posted_intr_wakeup_handler(NULL); @@ -8251,18 +8294,7 @@ static void vmx_hardware_unsetup(void) free_kvm_area(); } -#define VMX_REQUIRED_APICV_INHIBITS \ -( \ - BIT(APICV_INHIBIT_REASON_DISABLE)| \ - BIT(APICV_INHIBIT_REASON_ABSENT) | \ - BIT(APICV_INHIBIT_REASON_HYPERV) | \ - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ - BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \ -) - -static void vmx_vm_destroy(struct kvm *kvm) +void vmx_vm_destroy(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); @@ -8313,148 +8345,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); } -static struct kvm_x86_ops vmx_x86_ops __initdata = { - .name = KBUILD_MODNAME, - - .check_processor_compatibility = vmx_check_processor_compat, - - .hardware_unsetup = vmx_hardware_unsetup, - - .hardware_enable = vmx_hardware_enable, - .hardware_disable = vmx_hardware_disable, - .has_emulated_msr = vmx_has_emulated_msr, - - .vm_size = sizeof(struct kvm_vmx), - .vm_init = vmx_vm_init, - .vm_destroy = vmx_vm_destroy, - - .vcpu_precreate = vmx_vcpu_precreate, - .vcpu_create = vmx_vcpu_create, - .vcpu_free = vmx_vcpu_free, - .vcpu_reset = vmx_vcpu_reset, - - .prepare_switch_to_guest = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, - - .update_exception_bitmap = vmx_update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .is_valid_cr0 = vmx_is_valid_cr0, - .set_cr0 = vmx_set_cr0, - .is_valid_cr4 = vmx_is_valid_cr4, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - .get_if_flag = vmx_get_if_flag, - - .flush_tlb_all = vmx_flush_tlb_all, - .flush_tlb_current = vmx_flush_tlb_current, - .flush_tlb_gva = vmx_flush_tlb_gva, - .flush_tlb_guest = vmx_flush_tlb_guest, - - .vcpu_pre_run = vmx_vcpu_pre_run, - .vcpu_run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = vmx_skip_emulated_instruction, - .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .inject_irq = vmx_inject_irq, - .inject_nmi = vmx_inject_nmi, - .inject_exception = vmx_inject_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = vmx_enable_nmi_window, - .enable_irq_window = vmx_enable_irq_window, - .update_cr8_intercept = vmx_update_cr8_intercept, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_pre_state_restore = vmx_apicv_pre_state_restore, - .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, - .hwapic_irr_update = vmx_hwapic_irr_update, - .hwapic_isr_update = vmx_hwapic_isr_update, - .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_interrupt = vmx_deliver_interrupt, - .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, - - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, - .get_mt_mask = vmx_get_mt_mask, - - .get_exit_info = vmx_get_exit_info, - - .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, - - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - - .get_l2_tsc_offset = vmx_get_l2_tsc_offset, - .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, - .write_tsc_offset = vmx_write_tsc_offset, - .write_tsc_multiplier = vmx_write_tsc_multiplier, - - .load_mmu_pgd = vmx_load_mmu_pgd, - - .check_intercept = vmx_check_intercept, - .handle_exit_irqoff = vmx_handle_exit_irqoff, - - .sched_in = vmx_sched_in, - - .cpu_dirty_log_size = PML_ENTITY_NUM, - .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - - .nested_ops = &vmx_nested_ops, - - .pi_update_irte = vmx_pi_update_irte, - .pi_start_assignment = vmx_pi_start_assignment, - -#ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, -#endif - - .setup_mce = vmx_setup_mce, - -#ifdef CONFIG_KVM_SMM - .smi_allowed = vmx_smi_allowed, - .enter_smm = vmx_enter_smm, - .leave_smm = vmx_leave_smm, - .enable_smi_window = vmx_enable_smi_window, -#endif - - .check_emulate_instruction = vmx_check_emulate_instruction, - .apic_init_signal_blocked = vmx_apic_init_signal_blocked, - .migrate_timers = vmx_migrate_timers, - - .msr_filter_changed = vmx_msr_filter_changed, - .complete_emulated_msr = kvm_complete_insn_gp, - - .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, - - .get_untagged_addr = vmx_get_untagged_addr, -}; - static unsigned int vmx_handle_intel_pt_intr(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); @@ -8520,9 +8410,7 @@ static void __init vmx_setup_me_spte_mask(void) kvm_mmu_set_me_spte_mask(0, me_mask); } -static struct kvm_x86_init_ops vmx_init_ops __initdata; - -static __init int hardware_setup(void) +__init int vmx_hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; @@ -8591,16 +8479,16 @@ static __init int hardware_setup(void) * using the APIC_ACCESS_ADDR VMCS field. */ if (!flexpriority_enabled) - vmx_x86_ops.set_apic_access_page_addr = NULL; + vt_x86_ops.set_apic_access_page_addr = NULL; if (!cpu_has_vmx_tpr_shadow()) - vmx_x86_ops.update_cr8_intercept = NULL; + vt_x86_ops.update_cr8_intercept = NULL; #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { - vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; - vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; + vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; } #endif @@ -8615,7 +8503,7 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) enable_apicv = 0; if (!enable_apicv) - vmx_x86_ops.sync_pir_to_irr = NULL; + vt_x86_ops.sync_pir_to_irr = NULL; if (!enable_apicv || !cpu_has_vmx_ipiv()) enable_ipiv = false; @@ -8651,7 +8539,7 @@ static __init int hardware_setup(void) enable_pml = 0; if (!enable_pml) - vmx_x86_ops.cpu_dirty_log_size = 0; + vt_x86_ops.cpu_dirty_log_size = 0; if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; @@ -8676,8 +8564,8 @@ static __init int hardware_setup(void) } if (!enable_preemption_timer) { - vmx_x86_ops.set_hv_timer = NULL; - vmx_x86_ops.cancel_hv_timer = NULL; + vt_x86_ops.set_hv_timer = NULL; + vt_x86_ops.cancel_hv_timer = NULL; } kvm_caps.supported_mce_cap |= MCG_LMCE_P; @@ -8688,9 +8576,9 @@ static __init int hardware_setup(void) if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) - vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; else - vmx_init_ops.handle_intel_pt_intr = NULL; + vt_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -8713,14 +8601,6 @@ static __init int hardware_setup(void) return r; } -static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .hardware_setup = hardware_setup, - .handle_intel_pt_intr = NULL, - - .runtime_ops = &vmx_x86_ops, - .pmu_ops = &intel_pmu_ops, -}; - static void vmx_cleanup_l1d_flush(void) { if (vmx_l1d_flush_pages) { @@ -8762,7 +8642,7 @@ static int __init vmx_init(void) */ hv_init_evmcs(); - r = kvm_x86_vendor_init(&vmx_init_ops); + r = kvm_x86_vendor_init(&vt_init_ops); if (r) return r; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 90f9e4434646..e4a605c5808e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -365,6 +365,9 @@ struct vcpu_vmx { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); } shadow_msr_intercept; + + /* ve_info must be page aligned. */ + struct vmx_ve_information *ve_info; }; struct kvm_vmx { @@ -577,7 +580,8 @@ static inline u8 vmx_get_rvi(void) SECONDARY_EXEC_ENABLE_VMFUNC | \ SECONDARY_EXEC_BUS_LOCK_DETECTION | \ SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) + SECONDARY_EXEC_ENCLS_EXITING | \ + SECONDARY_EXEC_EPT_VIOLATION_VE) #define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0 #define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \ diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h new file mode 100644 index 000000000000..502704596c83 --- /dev/null +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_X86_OPS_H +#define __KVM_X86_VMX_X86_OPS_H + +#include <linux/kvm_host.h> + +#include "x86.h" + +__init int vmx_hardware_setup(void); + +extern struct kvm_x86_ops vt_x86_ops __initdata; +extern struct kvm_x86_init_ops vt_init_ops __initdata; + +void vmx_hardware_unsetup(void); +int vmx_check_processor_compat(void); +int vmx_hardware_enable(void); +void vmx_hardware_disable(void); +int vmx_vm_init(struct kvm *kvm); +void vmx_vm_destroy(struct kvm *kvm); +int vmx_vcpu_precreate(struct kvm *kvm); +int vmx_vcpu_create(struct kvm_vcpu *vcpu); +int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu); +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); +void vmx_vcpu_free(struct kvm_vcpu *vcpu); +void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_put(struct kvm_vcpu *vcpu); +int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath); +void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu); +int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu); +void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu); +int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +#ifdef CONFIG_KVM_SMM +int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection); +int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram); +int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram); +void vmx_enable_smi_window(struct kvm_vcpu *vcpu); +#endif +int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); +int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage, + struct x86_exception *exception); +bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu); +void vmx_migrate_timers(struct kvm_vcpu *vcpu); +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu); +bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); +void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); +void vmx_hwapic_isr_update(int max_isr); +bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu); +int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu); +void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); +void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); +bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); +void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); +void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); +int vmx_get_msr_feature(struct kvm_msr_entry *msr); +int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); +void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +int vmx_get_cpl(struct kvm_vcpu *vcpu); +void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); +bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); +void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); +void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +bool vmx_get_if_flag(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_all(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_current(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr); +void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu); +void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); +u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); +void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall); +void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected); +void vmx_inject_nmi(struct kvm_vcpu *vcpu); +void vmx_inject_exception(struct kvm_vcpu *vcpu); +void vmx_cancel_injection(struct kvm_vcpu *vcpu); +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection); +int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection); +bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +void vmx_enable_nmi_window(struct kvm_vcpu *vcpu); +void vmx_enable_irq_window(struct kvm_vcpu *vcpu); +void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr); +void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu); +void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); +void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr); +u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); +void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); +void vmx_write_tsc_offset(struct kvm_vcpu *vcpu); +void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu); +void vmx_request_immediate_exit(struct kvm_vcpu *vcpu); +void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu); +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +#ifdef CONFIG_X86_64 +int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired); +void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu); +#endif +void vmx_setup_mce(struct kvm_vcpu *vcpu); + +#endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91478b769af0..fda22b3800a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,9 +92,12 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -struct kvm_caps kvm_caps __read_mostly = { - .supported_mce_cap = MCG_CTL_P | MCG_SER_P, -}; +/* + * Note, kvm_caps fields should *never* have default values, all fields must be + * recomputed from scratch during vendor module load, e.g. to account for a + * vendor module being reloaded with different module parameters. + */ +struct kvm_caps kvm_caps __read_mostly; EXPORT_SYMBOL_GPL(kvm_caps); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -4629,9 +4632,7 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, static bool kvm_is_vm_type_supported(unsigned long type) { - return type == KVM_X86_DEFAULT_VM || - (type == KVM_X86_SW_PROTECTED_VM && - IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled); + return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -4832,9 +4833,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_caps.has_notify_vmexit; break; case KVM_CAP_VM_TYPES: - r = BIT(KVM_X86_DEFAULT_VM); - if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM)) - r |= BIT(KVM_X86_SW_PROTECTED_VM); + r = kvm_caps.supported_vm_types; break; default: break; @@ -4842,46 +4841,44 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } -static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) -{ - void __user *uaddr = (void __user*)(unsigned long)attr->addr; - - if ((u64)(unsigned long)uaddr != attr->addr) - return ERR_PTR_USR(-EFAULT); - return uaddr; -} - -static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val) { - u64 __user *uaddr = kvm_get_attr_addr(attr); - - if (attr->group) + if (attr->group) { + if (kvm_x86_ops.dev_get_attr) + return static_call(kvm_x86_dev_get_attr)(attr->group, attr->attr, val); return -ENXIO; - - if (IS_ERR(uaddr)) - return PTR_ERR(uaddr); + } switch (attr->attr) { case KVM_X86_XCOMP_GUEST_SUPP: - if (put_user(kvm_caps.supported_xcr0, uaddr)) - return -EFAULT; + *val = kvm_caps.supported_xcr0; return 0; default: return -ENXIO; } } +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +{ + u64 __user *uaddr = u64_to_user_ptr(attr->addr); + int r; + u64 val; + + r = __kvm_x86_dev_get_attr(attr, &val); + if (r < 0) + return r; + + if (put_user(val, uaddr)) + return -EFAULT; + + return 0; +} + static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) { - if (attr->group) - return -ENXIO; + u64 val; - switch (attr->attr) { - case KVM_X86_XCOMP_GUEST_SUPP: - return 0; - default: - return -ENXIO; - } + return __kvm_x86_dev_get_attr(attr, &val); } long kvm_arch_dev_ioctl(struct file *filp, @@ -5557,11 +5554,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return 0; } -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) +static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) { unsigned int i; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + memset(dbgregs, 0, sizeof(*dbgregs)); BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); @@ -5570,6 +5571,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; + return 0; } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -5577,6 +5579,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, { unsigned int i; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (dbgregs->flags) return -EINVAL; @@ -5597,8 +5603,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, } -static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, - u8 *state, unsigned int size) +static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) { /* * Only copy state for features that are enabled for the guest. The @@ -5616,24 +5622,25 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, XFEATURE_MASK_FPSSE; if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size, supported_xcr0, vcpu->arch.pkru); + return 0; } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) +static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) { - kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, - sizeof(guest_xsave->region)); + return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, guest_xsave->region, @@ -5641,18 +5648,23 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, &vcpu->arch.pkru); } -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, - struct kvm_xcrs *guest_xcrs) +static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { guest_xcrs->nr_xcrs = 0; - return; + return 0; } guest_xcrs->nr_xcrs = 1; guest_xcrs->flags = 0; guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; + return 0; } static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, @@ -5660,6 +5672,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, { int i, r = 0; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; @@ -5712,12 +5728,9 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - u64 __user *uaddr = kvm_get_attr_addr(attr); + u64 __user *uaddr = u64_to_user_ptr(attr->addr); int r; - if (IS_ERR(uaddr)) - return PTR_ERR(uaddr); - switch (attr->attr) { case KVM_VCPU_TSC_OFFSET: r = -EFAULT; @@ -5735,13 +5748,10 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - u64 __user *uaddr = kvm_get_attr_addr(attr); + u64 __user *uaddr = u64_to_user_ptr(attr->addr); struct kvm *kvm = vcpu->kvm; int r; - if (IS_ERR(uaddr)) - return PTR_ERR(uaddr); - switch (attr->attr) { case KVM_VCPU_TSC_OFFSET: { u64 offset, tsc, ns; @@ -6048,7 +6058,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_GET_DEBUGREGS: { struct kvm_debugregs dbgregs; - kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, &dbgregs, @@ -6078,7 +6090,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!u.xsave) break; - kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) @@ -6107,7 +6121,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!u.xsave) break; - kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, u.xsave, size)) @@ -6123,7 +6139,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!u.xcrs) break; - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, u.xcrs, @@ -6267,6 +6285,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_GET_SREGS2: { + r = -EINVAL; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + goto out; + u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL); r = -ENOMEM; if (!u.sregs2) @@ -6279,6 +6302,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_SREGS2: { + r = -EINVAL; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + goto out; + u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); if (IS_ERR(u.sregs2)) { r = PTR_ERR(u.sregs2); @@ -9732,6 +9760,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -EIO; } + memset(&kvm_caps, 0, sizeof(kvm_caps)); + x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("failed to allocate cache for x86 emulator\n"); @@ -9750,6 +9780,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (r) goto out_free_percpu; + kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); + kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; + if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; @@ -9795,6 +9828,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) + kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; @@ -10051,26 +10087,15 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, + unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, + int op_64_bit, int cpl) { - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit; - - if (kvm_xen_hypercall_enabled(vcpu->kvm)) - return kvm_xen_hypercall(vcpu); - - if (kvm_hv_hypercall_enabled(vcpu)) - return kvm_hv_hypercall(vcpu); - - nr = kvm_rax_read(vcpu); - a0 = kvm_rbx_read(vcpu); - a1 = kvm_rcx_read(vcpu); - a2 = kvm_rdx_read(vcpu); - a3 = kvm_rsi_read(vcpu); + unsigned long ret; trace_kvm_hypercall(nr, a0, a1, a2, a3); - op_64_bit = is_64_bit_hypercall(vcpu); if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; @@ -10079,7 +10104,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } - if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { + if (cpl) { ret = -KVM_EPERM; goto out; } @@ -10140,18 +10165,49 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); vcpu->arch.complete_userspace_io = complete_hypercall_exit; + /* stat is incremented on completion. */ return 0; } default: ret = -KVM_ENOSYS; break; } + out: + ++vcpu->stat.hypercalls; + return ret; +} +EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall); + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + unsigned long nr, a0, a1, a2, a3, ret; + int op_64_bit; + int cpl; + + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + if (kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + + nr = kvm_rax_read(vcpu); + a0 = kvm_rbx_read(vcpu); + a1 = kvm_rcx_read(vcpu); + a2 = kvm_rdx_read(vcpu); + a3 = kvm_rsi_read(vcpu); + op_64_bit = is_64_bit_hypercall(vcpu); + cpl = static_call(kvm_x86_get_cpl)(vcpu); + + ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl); + if (nr == KVM_HC_MAP_GPA_RANGE && !ret) + /* MAP_GPA tosses the request to the user space. */ + return 0; + if (!op_64_bit) ret = (u32)ret; kvm_rax_write(vcpu, ret); - ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); @@ -11486,6 +11542,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __get_regs(vcpu, regs); vcpu_put(vcpu); @@ -11527,6 +11587,10 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __set_regs(vcpu, regs); vcpu_put(vcpu); @@ -11599,6 +11663,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __get_sregs(vcpu, sregs); vcpu_put(vcpu); @@ -11866,6 +11934,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { int ret; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); ret = __set_sregs(vcpu, sregs); vcpu_put(vcpu); @@ -11983,7 +12055,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) struct fxregs_state *fxsave; if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; vcpu_load(vcpu); @@ -12006,7 +12078,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) struct fxregs_state *fxsave; if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; vcpu_load(vcpu); @@ -12532,6 +12604,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return -EINVAL; kvm->arch.vm_type = type; + kvm->arch.has_private_mem = + (type == KVM_X86_SW_PROTECTED_VM); ret = kvm_page_track_init(kvm); if (ret) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a8b71803777b..d80a4c6b5a38 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -24,6 +24,8 @@ struct kvm_caps { bool has_bus_lock_exit; /* notify VM exit supported? */ bool has_notify_vmexit; + /* bit mask of VM types */ + u32 supported_vm_types; u64 supported_mce_cap; u64 supported_xcr0; diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c index b4efdddb2ad9..78c490e0505a 100644 --- a/drivers/perf/riscv_pmu.c +++ b/drivers/perf/riscv_pmu.c @@ -191,8 +191,6 @@ void riscv_pmu_stop(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu); - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - if (!(hwc->state & PERF_HES_STOPPED)) { if (rvpmu->ctr_stop) { rvpmu->ctr_stop(event, 0); @@ -408,6 +406,7 @@ struct riscv_pmu *riscv_pmu_alloc(void) cpuc->n_events = 0; for (i = 0; i < RISCV_MAX_COUNTERS; i++) cpuc->events[i] = NULL; + cpuc->snapshot_addr = NULL; } pmu->pmu = (struct pmu) { .event_init = riscv_pmu_event_init, diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 8cbe6e5f9c39..5d699b06dcb6 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -27,7 +27,7 @@ #define ALT_SBI_PMU_OVERFLOW(__ovl) \ asm volatile(ALTERNATIVE_2( \ - "csrr %0, " __stringify(CSR_SSCOUNTOVF), \ + "csrr %0, " __stringify(CSR_SCOUNTOVF), \ "csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF), \ THEAD_VENDOR_ID, ERRATA_THEAD_PMU, \ CONFIG_ERRATA_THEAD_PMU, \ @@ -57,6 +57,11 @@ asm volatile(ALTERNATIVE( \ PMU_FORMAT_ATTR(event, "config:0-47"); PMU_FORMAT_ATTR(firmware, "config:63"); +static bool sbi_v2_available; +static DEFINE_STATIC_KEY_FALSE(sbi_pmu_snapshot_available); +#define sbi_pmu_snapshot_available() \ + static_branch_unlikely(&sbi_pmu_snapshot_available) + static struct attribute *riscv_arch_formats_attr[] = { &format_attr_event.attr, &format_attr_firmware.attr, @@ -384,7 +389,7 @@ static int pmu_sbi_ctr_get_idx(struct perf_event *event) cmask = 1; } else if (event->attr.config == PERF_COUNT_HW_INSTRUCTIONS) { cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH; - cmask = 1UL << (CSR_INSTRET - CSR_CYCLE); + cmask = BIT(CSR_INSTRET - CSR_CYCLE); } } @@ -506,24 +511,126 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) return ret; } +static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpu_hw_events *cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu); + + if (!cpu_hw_evt->snapshot_addr) + continue; + + free_page((unsigned long)cpu_hw_evt->snapshot_addr); + cpu_hw_evt->snapshot_addr = NULL; + cpu_hw_evt->snapshot_addr_phys = 0; + } +} + +static int pmu_sbi_snapshot_alloc(struct riscv_pmu *pmu) +{ + int cpu; + struct page *snapshot_page; + + for_each_possible_cpu(cpu) { + struct cpu_hw_events *cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu); + + snapshot_page = alloc_page(GFP_ATOMIC | __GFP_ZERO); + if (!snapshot_page) { + pmu_sbi_snapshot_free(pmu); + return -ENOMEM; + } + cpu_hw_evt->snapshot_addr = page_to_virt(snapshot_page); + cpu_hw_evt->snapshot_addr_phys = page_to_phys(snapshot_page); + } + + return 0; +} + +static int pmu_sbi_snapshot_disable(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, SBI_SHMEM_DISABLE, + SBI_SHMEM_DISABLE, 0, 0, 0, 0); + if (ret.error) { + pr_warn("failed to disable snapshot shared memory\n"); + return sbi_err_map_linux_errno(ret.error); + } + + return 0; +} + +static int pmu_sbi_snapshot_setup(struct riscv_pmu *pmu, int cpu) +{ + struct cpu_hw_events *cpu_hw_evt; + struct sbiret ret = {0}; + + cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu); + if (!cpu_hw_evt->snapshot_addr_phys) + return -EINVAL; + + if (cpu_hw_evt->snapshot_set_done) + return 0; + + if (IS_ENABLED(CONFIG_32BIT)) + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, + cpu_hw_evt->snapshot_addr_phys, + (u64)(cpu_hw_evt->snapshot_addr_phys) >> 32, 0, 0, 0, 0); + else + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, + cpu_hw_evt->snapshot_addr_phys, 0, 0, 0, 0, 0); + + /* Free up the snapshot area memory and fall back to SBI PMU calls without snapshot */ + if (ret.error) { + if (ret.error != SBI_ERR_NOT_SUPPORTED) + pr_warn("pmu snapshot setup failed with error %ld\n", ret.error); + return sbi_err_map_linux_errno(ret.error); + } + + memset(cpu_hw_evt->snapshot_cval_shcopy, 0, sizeof(u64) * RISCV_MAX_COUNTERS); + cpu_hw_evt->snapshot_set_done = true; + + return 0; +} + static u64 pmu_sbi_ctr_read(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; struct sbiret ret; - union sbi_pmu_ctr_info info; u64 val = 0; + struct riscv_pmu *pmu = to_riscv_pmu(event->pmu); + struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events); + struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr; + union sbi_pmu_ctr_info info = pmu_ctr_list[idx]; + + /* Read the value from the shared memory directly only if counter is stopped */ + if (sbi_pmu_snapshot_available() && (hwc->state & PERF_HES_STOPPED)) { + val = sdata->ctr_values[idx]; + return val; + } if (pmu_sbi_is_fw_event(event)) { ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ, hwc->idx, 0, 0, 0, 0, 0); - if (!ret.error) - val = ret.value; + if (ret.error) + return 0; + + val = ret.value; + if (IS_ENABLED(CONFIG_32BIT) && sbi_v2_available && info.width >= 32) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ_HI, + hwc->idx, 0, 0, 0, 0, 0); + if (!ret.error) + val |= ((u64)ret.value << 32); + else + WARN_ONCE(1, "Unable to read upper 32 bits of firmware counter error: %ld\n", + ret.error); + } } else { - info = pmu_ctr_list[idx]; val = riscv_pmu_ctr_read_csr(info.csr); if (IS_ENABLED(CONFIG_32BIT)) - val = ((u64)riscv_pmu_ctr_read_csr(info.csr + 0x80)) << 31 | val; + val |= ((u64)riscv_pmu_ctr_read_csr(info.csr + 0x80)) << 32; } return val; @@ -553,6 +660,7 @@ static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival) struct hw_perf_event *hwc = &event->hw; unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE; + /* There is no benefit setting SNAPSHOT FLAG for a single counter */ #if defined(CONFIG_32BIT) ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, hwc->idx, 1, flag, ival, ival >> 32, 0); @@ -573,16 +681,36 @@ static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag) { struct sbiret ret; struct hw_perf_event *hwc = &event->hw; + struct riscv_pmu *pmu = to_riscv_pmu(event->pmu); + struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events); + struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr; if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) && (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) pmu_sbi_reset_scounteren((void *)event); + if (sbi_pmu_snapshot_available()) + flag |= SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT; + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0); - if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) && - flag != SBI_PMU_STOP_FLAG_RESET) + if (!ret.error && sbi_pmu_snapshot_available()) { + /* + * The counter snapshot is based on the index base specified by hwc->idx. + * The actual counter value is updated in shared memory at index 0 when counter + * mask is 0x01. To ensure accurate counter values, it's necessary to transfer + * the counter value to shared memory. However, if hwc->idx is zero, the counter + * value is already correctly updated in shared memory, requiring no further + * adjustment. + */ + if (hwc->idx > 0) { + sdata->ctr_values[hwc->idx] = sdata->ctr_values[0]; + sdata->ctr_values[0] = 0; + } + } else if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) && + flag != SBI_PMU_STOP_FLAG_RESET) { pr_err("Stopping counter idx %d failed with error %d\n", hwc->idx, sbi_err_map_linux_errno(ret.error)); + } } static int pmu_sbi_find_num_ctrs(void) @@ -640,10 +768,39 @@ static inline void pmu_sbi_stop_all(struct riscv_pmu *pmu) static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu) { struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events); + struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr; + unsigned long flag = 0; + int i, idx; + struct sbiret ret; + u64 temp_ctr_overflow_mask = 0; + + if (sbi_pmu_snapshot_available()) + flag = SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT; + + /* Reset the shadow copy to avoid save/restore any value from previous overflow */ + memset(cpu_hw_evt->snapshot_cval_shcopy, 0, sizeof(u64) * RISCV_MAX_COUNTERS); + + for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) { + /* No need to check the error here as we can't do anything about the error */ + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, i * BITS_PER_LONG, + cpu_hw_evt->used_hw_ctrs[i], flag, 0, 0, 0); + if (!ret.error && sbi_pmu_snapshot_available()) { + /* Save the counter values to avoid clobbering */ + for_each_set_bit(idx, &cpu_hw_evt->used_hw_ctrs[i], BITS_PER_LONG) + cpu_hw_evt->snapshot_cval_shcopy[i * BITS_PER_LONG + idx] = + sdata->ctr_values[idx]; + /* Save the overflow mask to avoid clobbering */ + temp_ctr_overflow_mask |= sdata->ctr_overflow_mask << (i * BITS_PER_LONG); + } + } - /* No need to check the error here as we can't do anything about the error */ - sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, 0, - cpu_hw_evt->used_hw_ctrs[0], 0, 0, 0, 0); + /* Restore the counter values to the shared memory for used hw counters */ + if (sbi_pmu_snapshot_available()) { + for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS) + sdata->ctr_values[idx] = cpu_hw_evt->snapshot_cval_shcopy[idx]; + if (temp_ctr_overflow_mask) + sdata->ctr_overflow_mask = temp_ctr_overflow_mask; + } } /* @@ -652,11 +809,10 @@ static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu) * while the overflowed counters need to be started with updated initialization * value. */ -static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu, - unsigned long ctr_ovf_mask) +static inline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt, + u64 ctr_ovf_mask) { - int idx = 0; - struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events); + int idx = 0, i; struct perf_event *event; unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE; unsigned long ctr_start_mask = 0; @@ -664,11 +820,12 @@ static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu, struct hw_perf_event *hwc; u64 init_val = 0; - ctr_start_mask = cpu_hw_evt->used_hw_ctrs[0] & ~ctr_ovf_mask; - - /* Start all the counters that did not overflow in a single shot */ - sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, 0, ctr_start_mask, - 0, 0, 0, 0); + for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) { + ctr_start_mask = cpu_hw_evt->used_hw_ctrs[i] & ~ctr_ovf_mask; + /* Start all the counters that did not overflow in a single shot */ + sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, i * BITS_PER_LONG, ctr_start_mask, + 0, 0, 0, 0); + } /* Reinitialize and start all the counter that overflowed */ while (ctr_ovf_mask) { @@ -691,6 +848,52 @@ static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu, } } +static inline void pmu_sbi_start_ovf_ctrs_snapshot(struct cpu_hw_events *cpu_hw_evt, + u64 ctr_ovf_mask) +{ + int i, idx = 0; + struct perf_event *event; + unsigned long flag = SBI_PMU_START_FLAG_INIT_SNAPSHOT; + u64 max_period, init_val = 0; + struct hw_perf_event *hwc; + struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr; + + for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS) { + if (ctr_ovf_mask & BIT(idx)) { + event = cpu_hw_evt->events[idx]; + hwc = &event->hw; + max_period = riscv_pmu_ctr_get_width_mask(event); + init_val = local64_read(&hwc->prev_count) & max_period; + cpu_hw_evt->snapshot_cval_shcopy[idx] = init_val; + } + /* + * We do not need to update the non-overflow counters the previous + * value should have been there already. + */ + } + + for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) { + /* Restore the counter values to relative indices for used hw counters */ + for_each_set_bit(idx, &cpu_hw_evt->used_hw_ctrs[i], BITS_PER_LONG) + sdata->ctr_values[idx] = + cpu_hw_evt->snapshot_cval_shcopy[idx + i * BITS_PER_LONG]; + /* Start all the counters in a single shot */ + sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, idx * BITS_PER_LONG, + cpu_hw_evt->used_hw_ctrs[i], flag, 0, 0, 0); + } +} + +static void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu, + u64 ctr_ovf_mask) +{ + struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events); + + if (sbi_pmu_snapshot_available()) + pmu_sbi_start_ovf_ctrs_snapshot(cpu_hw_evt, ctr_ovf_mask); + else + pmu_sbi_start_ovf_ctrs_sbi(cpu_hw_evt, ctr_ovf_mask); +} + static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) { struct perf_sample_data data; @@ -700,10 +903,11 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) int lidx, hidx, fidx; struct riscv_pmu *pmu; struct perf_event *event; - unsigned long overflow; - unsigned long overflowed_ctrs = 0; + u64 overflow; + u64 overflowed_ctrs = 0; struct cpu_hw_events *cpu_hw_evt = dev; u64 start_clock = sched_clock(); + struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr; if (WARN_ON_ONCE(!cpu_hw_evt)) return IRQ_NONE; @@ -725,7 +929,10 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) pmu_sbi_stop_hw_ctrs(pmu); /* Overflow status register should only be read after counter are stopped */ - ALT_SBI_PMU_OVERFLOW(overflow); + if (sbi_pmu_snapshot_available()) + overflow = sdata->ctr_overflow_mask; + else + ALT_SBI_PMU_OVERFLOW(overflow); /* * Overflow interrupt pending bit should only be cleared after stopping @@ -751,9 +958,14 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) if (!info || info->type != SBI_PMU_CTR_TYPE_HW) continue; - /* compute hardware counter index */ - hidx = info->csr - CSR_CYCLE; - /* check if the corresponding bit is set in sscountovf */ + if (sbi_pmu_snapshot_available()) + /* SBI implementation already updated the logical indicies */ + hidx = lidx; + else + /* compute hardware counter index */ + hidx = info->csr - CSR_CYCLE; + + /* check if the corresponding bit is set in sscountovf or overflow mask in shmem */ if (!(overflow & BIT(hidx))) continue; @@ -763,7 +975,10 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) */ overflowed_ctrs |= BIT(lidx); hw_evt = &event->hw; + /* Update the event states here so that we know the state while reading */ + hw_evt->state |= PERF_HES_STOPPED; riscv_pmu_event_update(event); + hw_evt->state |= PERF_HES_UPTODATE; perf_sample_data_init(&data, 0, hw_evt->last_period); if (riscv_pmu_event_set_period(event)) { /* @@ -776,6 +991,8 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) */ perf_event_overflow(event, &data, regs); } + /* Reset the state as we are going to start the counter after the loop */ + hw_evt->state = 0; } pmu_sbi_start_overflow_mask(pmu, overflowed_ctrs); @@ -807,6 +1024,9 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node) enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE); } + if (sbi_pmu_snapshot_available()) + return pmu_sbi_snapshot_setup(pmu, cpu); + return 0; } @@ -819,6 +1039,9 @@ static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node) /* Disable all counters access for user mode now */ csr_write(CSR_SCOUNTEREN, 0x0); + if (sbi_pmu_snapshot_available()) + return pmu_sbi_snapshot_disable(); + return 0; } @@ -927,6 +1150,12 @@ static inline void riscv_pm_pmu_unregister(struct riscv_pmu *pmu) { } static void riscv_pmu_destroy(struct riscv_pmu *pmu) { + if (sbi_v2_available) { + if (sbi_pmu_snapshot_available()) { + pmu_sbi_snapshot_disable(); + pmu_sbi_snapshot_free(pmu); + } + } riscv_pm_pmu_unregister(pmu); cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node); } @@ -1094,10 +1323,6 @@ static int pmu_sbi_device_probe(struct platform_device *pdev) pmu->event_unmapped = pmu_sbi_event_unmapped; pmu->csr_index = pmu_sbi_csr_index; - ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node); - if (ret) - return ret; - ret = riscv_pm_pmu_register(pmu); if (ret) goto out_unregister; @@ -1106,8 +1331,34 @@ static int pmu_sbi_device_probe(struct platform_device *pdev) if (ret) goto out_unregister; + /* SBI PMU Snapsphot is only available in SBI v2.0 */ + if (sbi_v2_available) { + ret = pmu_sbi_snapshot_alloc(pmu); + if (ret) + goto out_unregister; + + ret = pmu_sbi_snapshot_setup(pmu, smp_processor_id()); + if (ret) { + /* Snapshot is an optional feature. Continue if not available */ + pmu_sbi_snapshot_free(pmu); + } else { + pr_info("SBI PMU snapshot detected\n"); + /* + * We enable it once here for the boot cpu. If snapshot shmem setup + * fails during cpu hotplug process, it will fail to start the cpu + * as we can not handle hetergenous PMUs with different snapshot + * capability. + */ + static_branch_enable(&sbi_pmu_snapshot_available); + } + } + register_sysctl("kernel", sbi_pmu_sysctl_table); + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node); + if (ret) + goto out_unregister; + return 0; out_unregister: @@ -1135,6 +1386,9 @@ static int __init pmu_sbi_devinit(void) return 0; } + if (sbi_spec_version >= sbi_mk_version(2, 0)) + sbi_v2_available = true; + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_RISCV_STARTING, "perf/riscv/pmu:starting", pmu_sbi_starting_cpu, pmu_sbi_dying_cpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 48f31dcd318a..afbc99264ffa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -259,7 +259,6 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { - pte_t pte; unsigned long attributes; }; @@ -273,7 +272,6 @@ struct kvm_gfn_range { bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); #endif enum { diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index d93f6522b2c3..827ecc0b7e10 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -86,6 +86,7 @@ struct gfn_to_pfn_cache { struct kvm_mmu_memory_cache { gfp_t gfp_zero; gfp_t gfp_custom; + u64 init_value; struct kmem_cache *kmem_cache; int capacity; int nobjs; diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index f349e08a9dfe..d39ebb10caeb 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -123,15 +123,6 @@ struct mmu_notifier_ops { unsigned long address); /* - * change_pte is called in cases that pte mapping to page is changed: - * for example, when ksm remaps pte to point to a new shared page. - */ - void (*change_pte)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long address, - pte_t pte); - - /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_lock and/or the * locks protecting the reverse maps are held. If the subsystem @@ -392,8 +383,6 @@ extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); -extern void __mmu_notifier_change_pte(struct mm_struct *mm, - unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, @@ -439,13 +428,6 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm, return 0; } -static inline void mmu_notifier_change_pte(struct mm_struct *mm, - unsigned long address, pte_t pte) -{ - if (mm_has_notifiers(mm)) - __mmu_notifier_change_pte(mm, address, pte); -} - static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { @@ -581,26 +563,6 @@ static inline void mmu_notifier_range_init_owner( __young; \ }) -/* - * set_pte_at_notify() sets the pte _after_ running the notifier. - * This is safe to start by updating the secondary MMUs, because the primary MMU - * pte invalidate must have already happened with a ptep_clear_flush() before - * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is - * required when we change both the protection of the mapping from read-only to - * read-write and the pfn (like during copy on write page faults). Otherwise the - * old page would remain mapped readonly in the secondary MMUs after the new - * page is already writable by some CPU through the primary MMU. - */ -#define set_pte_at_notify(__mm, __address, __ptep, __pte) \ -({ \ - struct mm_struct *___mm = __mm; \ - unsigned long ___address = __address; \ - pte_t ___pte = __pte; \ - \ - mmu_notifier_change_pte(___mm, ___address, ___pte); \ - set_pte_at(___mm, ___address, __ptep, ___pte); \ -}) - #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { @@ -650,11 +612,6 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm, return 0; } -static inline void mmu_notifier_change_pte(struct mm_struct *mm, - unsigned long address, pte_t pte) -{ -} - static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { @@ -693,7 +650,6 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush -#define set_pte_at_notify set_pte_at static inline void mmu_notifier_synchronize(void) { diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h index 43282e22ebe1..701974639ff2 100644 --- a/include/linux/perf/riscv_pmu.h +++ b/include/linux/perf/riscv_pmu.h @@ -39,6 +39,14 @@ struct cpu_hw_events { DECLARE_BITMAP(used_hw_ctrs, RISCV_MAX_COUNTERS); /* currently enabled firmware counters */ DECLARE_BITMAP(used_fw_ctrs, RISCV_MAX_COUNTERS); + /* The virtual address of the shared memory where counter snapshot will be taken */ + void *snapshot_addr; + /* The physical address of the shared memory where counter snapshot will be taken */ + phys_addr_t snapshot_addr_phys; + /* Boolean flag to indicate setup is already done */ + bool snapshot_set_done; + /* A shadow copy of the counter values to avoid clobbering during multiple SBI calls */ + u64 snapshot_cval_shcopy[RISCV_MAX_COUNTERS]; }; struct riscv_pmu { diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 011fba6b5552..74e40d5d4af4 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -456,21 +456,6 @@ TRACE_EVENT(kvm_unmap_hva_range, __entry->start, __entry->end) ); -TRACE_EVENT(kvm_set_spte_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) -); - TRACE_EVENT(kvm_age_hva, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end), diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4834d23e1d1..1215bc299390 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -18,7 +18,7 @@ #include <linux/sched/coredump.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ -#include <linux/mmu_notifier.h> /* set_pte_at_notify */ +#include <linux/mmu_notifier.h> #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ @@ -195,8 +195,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + set_pte_at(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); folio_remove_rmap_pte(old_folio, old_page, vma); if (!folio_mapped(old_folio)) @@ -1345,7 +1345,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (pte_write(entry)) entry = pte_wrprotect(entry); - set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); + set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; @@ -1447,7 +1447,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, newpte); + set_pte_at(mm, addr, ptep, newpte); folio = page_folio(page); folio_remove_rmap_pte(folio, page, vma); diff --git a/mm/memory.c b/mm/memory.c index d2155ced45f8..0201f50d8307 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3329,13 +3329,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address); folio_add_lru_vma(new_folio, vma); - /* - * We call the notify macro here because, when using secondary - * mmu page tables (such as kvm shadow page tables), we want the - * new page to be mapped directly into the secondary page table. - */ BUG_ON(unshare && pte_write(entry)); - set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + set_pte_at(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* diff --git a/mm/migrate_device.c b/mm/migrate_device.c index b6c27c76e1a0..66206734b1b9 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -664,13 +664,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (flush) { flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } else { - /* No need to invalidate - it was non-present before */ - set_pte_at(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); } + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); pte_unmap_unlock(ptep, ptl); *src = MIGRATE_PFN_MIGRATE; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index ec3b068cbbe6..8982e6139d07 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -424,23 +424,6 @@ int __mmu_notifier_test_young(struct mm_struct *mm, return young; } -void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, - pte_t pte) -{ - struct mmu_notifier *subscription; - int id; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist, - srcu_read_lock_held(&srcu)) { - if (subscription->ops->change_pte) - subscription->ops->change_pte(subscription, mm, address, - pte); - } - srcu_read_unlock(&srcu, id); -} - static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range) { diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 741c7dc16afc..266c0fe85fef 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -120,6 +120,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test TEST_GEN_PROGS_x86_64 += x86_64/amx_test @@ -189,6 +190,8 @@ TEST_GEN_PROGS_s390x += rseq_test TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS_s390x += kvm_binary_stats_test +TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test +TEST_GEN_PROGS_riscv += riscv/ebreak_test TEST_GEN_PROGS_riscv += arch_timer TEST_GEN_PROGS_riscv += demand_paging_test TEST_GEN_PROGS_riscv += dirty_log_test diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 3e0db283a46a..8acca8237687 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -93,7 +93,6 @@ enum kvm_mem_region_type { struct kvm_vm { int mode; unsigned long type; - uint8_t subtype; int kvm_fd; int fd; unsigned int pgtable_levels; @@ -200,8 +199,8 @@ enum vm_guest_mode { struct vm_shape { uint32_t type; uint8_t mode; - uint8_t subtype; - uint16_t padding; + uint8_t pad0; + uint16_t pad1; }; kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); @@ -890,17 +889,15 @@ static inline struct kvm_vm *vm_create_barebones(void) return ____vm_create(VM_SHAPE_DEFAULT); } -#ifdef __x86_64__ -static inline struct kvm_vm *vm_create_barebones_protected_vm(void) +static inline struct kvm_vm *vm_create_barebones_type(unsigned long type) { const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, - .type = KVM_X86_SW_PROTECTED_VM, + .type = type, }; return ____vm_create(shape); } -#endif static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) { diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h index ce473fe251dd..5f389166338c 100644 --- a/tools/testing/selftests/kvm/include/riscv/processor.h +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -50,6 +50,16 @@ static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype, bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext); +static inline bool __vcpu_has_isa_ext(struct kvm_vcpu *vcpu, uint64_t isa_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_ISA_EXT_REG(isa_ext)); +} + +static inline bool __vcpu_has_sbi_ext(struct kvm_vcpu *vcpu, uint64_t sbi_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_SBI_EXT_REG(sbi_ext)); +} + struct ex_regs { unsigned long ra; unsigned long sp; @@ -154,45 +164,6 @@ void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handle #define PGTBL_PAGE_SIZE PGTBL_L0_BLOCK_SIZE #define PGTBL_PAGE_SIZE_SHIFT PGTBL_L0_BLOCK_SHIFT -/* SBI return error codes */ -#define SBI_SUCCESS 0 -#define SBI_ERR_FAILURE -1 -#define SBI_ERR_NOT_SUPPORTED -2 -#define SBI_ERR_INVALID_PARAM -3 -#define SBI_ERR_DENIED -4 -#define SBI_ERR_INVALID_ADDRESS -5 -#define SBI_ERR_ALREADY_AVAILABLE -6 -#define SBI_ERR_ALREADY_STARTED -7 -#define SBI_ERR_ALREADY_STOPPED -8 - -#define SBI_EXT_EXPERIMENTAL_START 0x08000000 -#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF - -#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END -#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 -#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 - -enum sbi_ext_id { - SBI_EXT_BASE = 0x10, - SBI_EXT_STA = 0x535441, -}; - -enum sbi_ext_base_fid { - SBI_EXT_BASE_PROBE_EXT = 3, -}; - -struct sbiret { - long error; - long value; -}; - -struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, - unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5); - -bool guest_sbi_probe_extension(int extid, long *out_val); - static inline void local_irq_enable(void) { csr_set(CSR_SSTATUS, SR_SIE); diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h new file mode 100644 index 000000000000..046b432ae896 --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/sbi.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * RISC-V SBI specific definitions + * + * Copyright (C) 2024 Rivos Inc. + */ + +#ifndef SELFTEST_KVM_SBI_H +#define SELFTEST_KVM_SBI_H + +/* SBI spec version fields */ +#define SBI_SPEC_VERSION_DEFAULT 0x1 +#define SBI_SPEC_VERSION_MAJOR_SHIFT 24 +#define SBI_SPEC_VERSION_MAJOR_MASK 0x7f +#define SBI_SPEC_VERSION_MINOR_MASK 0xffffff + +/* SBI return error codes */ +#define SBI_SUCCESS 0 +#define SBI_ERR_FAILURE -1 +#define SBI_ERR_NOT_SUPPORTED -2 +#define SBI_ERR_INVALID_PARAM -3 +#define SBI_ERR_DENIED -4 +#define SBI_ERR_INVALID_ADDRESS -5 +#define SBI_ERR_ALREADY_AVAILABLE -6 +#define SBI_ERR_ALREADY_STARTED -7 +#define SBI_ERR_ALREADY_STOPPED -8 + +#define SBI_EXT_EXPERIMENTAL_START 0x08000000 +#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF + +#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END +#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 +#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 + +enum sbi_ext_id { + SBI_EXT_BASE = 0x10, + SBI_EXT_STA = 0x535441, + SBI_EXT_PMU = 0x504D55, +}; + +enum sbi_ext_base_fid { + SBI_EXT_BASE_GET_SPEC_VERSION = 0, + SBI_EXT_BASE_GET_IMP_ID, + SBI_EXT_BASE_GET_IMP_VERSION, + SBI_EXT_BASE_PROBE_EXT = 3, +}; +enum sbi_ext_pmu_fid { + SBI_EXT_PMU_NUM_COUNTERS = 0, + SBI_EXT_PMU_COUNTER_GET_INFO, + SBI_EXT_PMU_COUNTER_CFG_MATCH, + SBI_EXT_PMU_COUNTER_START, + SBI_EXT_PMU_COUNTER_STOP, + SBI_EXT_PMU_COUNTER_FW_READ, + SBI_EXT_PMU_COUNTER_FW_READ_HI, + SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, +}; + +union sbi_pmu_ctr_info { + unsigned long value; + struct { + unsigned long csr:12; + unsigned long width:6; +#if __riscv_xlen == 32 + unsigned long reserved:13; +#else + unsigned long reserved:45; +#endif + unsigned long type:1; + }; +}; + +struct riscv_pmu_snapshot_data { + u64 ctr_overflow_mask; + u64 ctr_values[64]; + u64 reserved[447]; +}; + +struct sbiret { + long error; + long value; +}; + +/** General pmu event codes specified in SBI PMU extension */ +enum sbi_pmu_hw_generic_events_t { + SBI_PMU_HW_NO_EVENT = 0, + SBI_PMU_HW_CPU_CYCLES = 1, + SBI_PMU_HW_INSTRUCTIONS = 2, + SBI_PMU_HW_CACHE_REFERENCES = 3, + SBI_PMU_HW_CACHE_MISSES = 4, + SBI_PMU_HW_BRANCH_INSTRUCTIONS = 5, + SBI_PMU_HW_BRANCH_MISSES = 6, + SBI_PMU_HW_BUS_CYCLES = 7, + SBI_PMU_HW_STALLED_CYCLES_FRONTEND = 8, + SBI_PMU_HW_STALLED_CYCLES_BACKEND = 9, + SBI_PMU_HW_REF_CPU_CYCLES = 10, + + SBI_PMU_HW_GENERAL_MAX, +}; + +/* SBI PMU counter types */ +enum sbi_pmu_ctr_type { + SBI_PMU_CTR_TYPE_HW = 0x0, + SBI_PMU_CTR_TYPE_FW, +}; + +/* Flags defined for config matching function */ +#define SBI_PMU_CFG_FLAG_SKIP_MATCH BIT(0) +#define SBI_PMU_CFG_FLAG_CLEAR_VALUE BIT(1) +#define SBI_PMU_CFG_FLAG_AUTO_START BIT(2) +#define SBI_PMU_CFG_FLAG_SET_VUINH BIT(3) +#define SBI_PMU_CFG_FLAG_SET_VSINH BIT(4) +#define SBI_PMU_CFG_FLAG_SET_UINH BIT(5) +#define SBI_PMU_CFG_FLAG_SET_SINH BIT(6) +#define SBI_PMU_CFG_FLAG_SET_MINH BIT(7) + +/* Flags defined for counter start function */ +#define SBI_PMU_START_FLAG_SET_INIT_VALUE BIT(0) +#define SBI_PMU_START_FLAG_INIT_SNAPSHOT BIT(1) + +/* Flags defined for counter stop function */ +#define SBI_PMU_STOP_FLAG_RESET BIT(0) +#define SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT BIT(1) + +struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5); + +bool guest_sbi_probe_extension(int extid, long *out_val); + +/* Make SBI version */ +static inline unsigned long sbi_mk_version(unsigned long major, + unsigned long minor) +{ + return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << SBI_SPEC_VERSION_MAJOR_SHIFT) + | (minor & SBI_SPEC_VERSION_MINOR_MASK); +} + +unsigned long get_host_sbi_spec_version(void); + +#endif /* SELFTEST_KVM_SBI_H */ diff --git a/tools/testing/selftests/kvm/include/riscv/ucall.h b/tools/testing/selftests/kvm/include/riscv/ucall.h index be46eb32ec27..a695ae36f3e0 100644 --- a/tools/testing/selftests/kvm/include/riscv/ucall.h +++ b/tools/testing/selftests/kvm/include/riscv/ucall.h @@ -3,6 +3,7 @@ #define SELFTEST_KVM_UCALL_H #include "processor.h" +#include "sbi.h" #define UCALL_EXIT_REASON KVM_EXIT_RISCV_SBI diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 81ce37ec407d..74a59c7ce7ed 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -23,12 +23,6 @@ extern bool host_cpu_is_intel; extern bool host_cpu_is_amd; -enum vm_guest_x86_subtype { - VM_SUBTYPE_NONE = 0, - VM_SUBTYPE_SEV, - VM_SUBTYPE_SEV_ES, -}; - /* Forced emulation prefix, used to invoke the emulator unconditionally. */ #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h index 8a1bf88474c9..82c11c81a956 100644 --- a/tools/testing/selftests/kvm/include/x86_64/sev.h +++ b/tools/testing/selftests/kvm/include/x86_64/sev.h @@ -31,8 +31,9 @@ void sev_vm_launch(struct kvm_vm *vm, uint32_t policy); void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); void sev_vm_launch_finish(struct kvm_vm *vm); -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu); +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement); kvm_static_assert(SEV_RET_SUCCESS == 0); @@ -67,20 +68,8 @@ kvm_static_assert(SEV_RET_SUCCESS == 0); __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ }) -static inline void sev_vm_init(struct kvm_vm *vm) -{ - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - - vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); -} - - -static inline void sev_es_vm_init(struct kvm_vm *vm) -{ - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - - vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); -} +void sev_vm_init(struct kvm_vm *vm); +void sev_es_vm_init(struct kvm_vm *vm); static inline void sev_register_encrypted_memory(struct kvm_vm *vm, struct userspace_mem_region *region) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b2262b5fad9e..9da388100f3a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -276,7 +276,6 @@ struct kvm_vm *____vm_create(struct vm_shape shape) vm->mode = shape.mode; vm->type = shape.type; - vm->subtype = shape.subtype; vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index e8211f5d6863..ccb35573749c 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -502,3 +502,15 @@ bool guest_sbi_probe_extension(int extid, long *out_val) return true; } + +unsigned long get_host_sbi_spec_version(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_GET_SPEC_VERSION, 0, + 0, 0, 0, 0, 0); + + GUEST_ASSERT(!ret.error); + + return ret.value; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 74a4c736c9ae..9f87ca8b7ab6 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -578,10 +578,11 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm) sync_global_to_guest(vm, host_cpu_is_intel); sync_global_to_guest(vm, host_cpu_is_amd); - if (vm->subtype == VM_SUBTYPE_SEV) - sev_vm_init(vm); - else if (vm->subtype == VM_SUBTYPE_SEV_ES) - sev_es_vm_init(vm); + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + struct kvm_sev_init init = { 0 }; + + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } } void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) @@ -1081,9 +1082,12 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) void kvm_init_vm_address_properties(struct kvm_vm *vm) { - if (vm->subtype == VM_SUBTYPE_SEV || vm->subtype == VM_SUBTYPE_SEV_ES) { + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + vm->arch.sev_fd = open_sev_dev_path_or_exit(); vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); vm->gpa_tag_mask = vm->arch.c_bit; + } else { + vm->arch.sev_fd = -1; } } diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c index e248d3364b9c..d482029b6004 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/sev.c +++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c @@ -35,6 +35,32 @@ static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *regio } } +void sev_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + +void sev_es_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_ES_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) { struct kvm_sev_launch_start launch_start = { @@ -87,28 +113,30 @@ void sev_vm_launch_finish(struct kvm_vm *vm) TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); } -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu) { struct vm_shape shape = { - .type = VM_TYPE_DEFAULT, .mode = VM_MODE_DEFAULT, - .subtype = policy & SEV_POLICY_ES ? VM_SUBTYPE_SEV_ES : - VM_SUBTYPE_SEV, + .type = type, }; struct kvm_vm *vm; struct kvm_vcpu *cpus[1]; - uint8_t measurement[512]; vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus); *cpu = cpus[0]; + return vm; +} + +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement) +{ sev_vm_launch(vm, policy); - /* TODO: Validate the measurement is as expected. */ + if (!measurement) + measurement = alloca(256); + sev_vm_launch_measure(vm, measurement); sev_vm_launch_finish(vm); - - return vm; } diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index 0f9cabd99fd4..735b78569021 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -85,7 +85,7 @@ struct kvm_vm *test_vm_create(void) int nr_vcpus = test_args.nr_vcpus; vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); - __TEST_REQUIRE(__vcpu_has_ext(vcpus[0], RISCV_ISA_EXT_REG(KVM_RISCV_ISA_EXT_SSTC)), + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpus[0], KVM_RISCV_ISA_EXT_SSTC), "SSTC not available, skipping test\n"); vm_init_vector_tables(vm); diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c new file mode 100644 index 000000000000..823c132069b4 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V KVM ebreak test. + * + * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd. + * + */ +#include "kvm_util.h" + +#define LABEL_ADDRESS(v) ((uint64_t)&(v)) + +extern unsigned char sw_bp_1, sw_bp_2; +static uint64_t sw_bp_addr; + +static void guest_code(void) +{ + asm volatile( + ".option push\n" + ".option norvc\n" + "sw_bp_1: ebreak\n" + "sw_bp_2: ebreak\n" + ".option pop\n" + ); + GUEST_ASSERT_EQ(READ_ONCE(sw_bp_addr), LABEL_ADDRESS(sw_bp_2)); + + GUEST_DONE(); +} + +static void guest_breakpoint_handler(struct ex_regs *regs) +{ + WRITE_ONCE(sw_bp_addr, regs->epc); + regs->epc += 4; +} + +int main(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint64_t pc; + struct kvm_guest_debug debug = { + .control = KVM_GUESTDBG_ENABLE, + }; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_vector_tables(vm); + vcpu_init_vector_tables(vcpu); + vm_install_exception_handler(vm, EXC_BREAKPOINT, + guest_breakpoint_handler); + + /* + * Enable the guest debug. + * ebreak should exit to the VMM with KVM_EXIT_DEBUG reason. + */ + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &pc); + TEST_ASSERT_EQ(pc, LABEL_ADDRESS(sw_bp_1)); + + /* skip sw_bp_1 */ + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), pc + 4); + + /* + * Disable all debug controls. + * Guest should handle the ebreak without exiting to the VMM. + */ + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + + vcpu_run(vcpu); + + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index b882b7b9b785..222198dd6d04 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -43,6 +43,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_V: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMSTATEEN: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSCOFPMF: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSTC: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT: @@ -408,6 +409,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(V), KVM_ISA_EXT_ARR(SMSTATEEN), KVM_ISA_EXT_ARR(SSAIA), + KVM_ISA_EXT_ARR(SSCOFPMF), KVM_ISA_EXT_ARR(SSTC), KVM_ISA_EXT_ARR(SVINVAL), KVM_ISA_EXT_ARR(SVNAPOT), @@ -931,6 +933,7 @@ KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F); KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, FP_D); KVM_ISA_EXT_SIMPLE_CONFIG(h, H); KVM_ISA_EXT_SUBLIST_CONFIG(smstateen, SMSTATEEN); +KVM_ISA_EXT_SIMPLE_CONFIG(sscofpmf, SSCOFPMF); KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC); KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL); KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT); @@ -986,6 +989,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_fp_d, &config_h, &config_smstateen, + &config_sscofpmf, &config_sstc, &config_svinval, &config_svnapot, diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c new file mode 100644 index 000000000000..69bb94e6b227 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sbi_pmu_test.c - Tests the riscv64 SBI PMU functionality. + * + * Copyright (c) 2024, Rivos Inc. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" +#include "sbi.h" +#include "arch_timer.h" + +/* Maximum counters(firmware + hardware) */ +#define RISCV_MAX_PMU_COUNTERS 64 +union sbi_pmu_ctr_info ctrinfo_arr[RISCV_MAX_PMU_COUNTERS]; + +/* Snapshot shared memory data */ +#define PMU_SNAPSHOT_GPA_BASE BIT(30) +static void *snapshot_gva; +static vm_paddr_t snapshot_gpa; + +static int vcpu_shared_irq_count; +static int counter_in_use; + +/* Cache the available counters in a bitmask */ +static unsigned long counter_mask_available; + +static bool illegal_handler_invoked; + +#define SBI_PMU_TEST_BASIC BIT(0) +#define SBI_PMU_TEST_EVENTS BIT(1) +#define SBI_PMU_TEST_SNAPSHOT BIT(2) +#define SBI_PMU_TEST_OVERFLOW BIT(3) + +static int disabled_tests; + +unsigned long pmu_csr_read_num(int csr_num) +{ +#define switchcase_csr_read(__csr_num, __val) {\ + case __csr_num: \ + __val = csr_read(__csr_num); \ + break; } +#define switchcase_csr_read_2(__csr_num, __val) {\ + switchcase_csr_read(__csr_num + 0, __val) \ + switchcase_csr_read(__csr_num + 1, __val)} +#define switchcase_csr_read_4(__csr_num, __val) {\ + switchcase_csr_read_2(__csr_num + 0, __val) \ + switchcase_csr_read_2(__csr_num + 2, __val)} +#define switchcase_csr_read_8(__csr_num, __val) {\ + switchcase_csr_read_4(__csr_num + 0, __val) \ + switchcase_csr_read_4(__csr_num + 4, __val)} +#define switchcase_csr_read_16(__csr_num, __val) {\ + switchcase_csr_read_8(__csr_num + 0, __val) \ + switchcase_csr_read_8(__csr_num + 8, __val)} +#define switchcase_csr_read_32(__csr_num, __val) {\ + switchcase_csr_read_16(__csr_num + 0, __val) \ + switchcase_csr_read_16(__csr_num + 16, __val)} + + unsigned long ret = 0; + + switch (csr_num) { + switchcase_csr_read_32(CSR_CYCLE, ret) + switchcase_csr_read_32(CSR_CYCLEH, ret) + default : + break; + } + + return ret; +#undef switchcase_csr_read_32 +#undef switchcase_csr_read_16 +#undef switchcase_csr_read_8 +#undef switchcase_csr_read_4 +#undef switchcase_csr_read_2 +#undef switchcase_csr_read +} + +static inline void dummy_func_loop(uint64_t iter) +{ + int i = 0; + + while (i < iter) { + asm volatile("nop"); + i++; + } +} + +static void start_counter(unsigned long counter, unsigned long start_flags, + unsigned long ival) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, counter, 1, start_flags, + ival, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to start counter %ld\n", counter); +} + +/* This should be invoked only for reset counter use case */ +static void stop_reset_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, + stop_flags | SBI_PMU_STOP_FLAG_RESET, 0, 0, 0); + __GUEST_ASSERT(ret.error == SBI_ERR_ALREADY_STOPPED, + "Unable to stop counter %ld\n", counter); +} + +static void stop_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, stop_flags, + 0, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to stop counter %ld error %ld\n", + counter, ret.error); +} + +static void guest_illegal_exception_handler(struct ex_regs *regs) +{ + __GUEST_ASSERT(regs->cause == EXC_INST_ILLEGAL, + "Unexpected exception handler %lx\n", regs->cause); + + illegal_handler_invoked = true; + /* skip the trapping instruction */ + regs->epc += 4; +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int irq_num = regs->cause & ~CAUSE_IRQ_FLAG; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + unsigned long overflown_mask; + unsigned long counter_val = 0; + + /* Validate that we are in the correct irq handler */ + GUEST_ASSERT_EQ(irq_num, IRQ_PMU_OVF); + + /* Stop all counters first to avoid further interrupts */ + stop_counter(counter_in_use, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + csr_clear(CSR_SIP, BIT(IRQ_PMU_OVF)); + + overflown_mask = READ_ONCE(snapshot_data->ctr_overflow_mask); + GUEST_ASSERT(overflown_mask & 0x01); + + WRITE_ONCE(vcpu_shared_irq_count, vcpu_shared_irq_count+1); + + counter_val = READ_ONCE(snapshot_data->ctr_values[0]); + /* Now start the counter to mimick the real driver behavior */ + start_counter(counter_in_use, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_val); +} + +static unsigned long get_counter_index(unsigned long cbase, unsigned long cmask, + unsigned long cflags, + unsigned long event) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, cmask, + cflags, event, 0, 0); + __GUEST_ASSERT(ret.error == 0, "config matching failed %ld\n", ret.error); + GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS); + GUEST_ASSERT(BIT(ret.value) & counter_mask_available); + + return ret.value; +} + +static unsigned long get_num_counters(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_NUM_COUNTERS, 0, 0, 0, 0, 0, 0); + + __GUEST_ASSERT(ret.error == 0, "Unable to retrieve number of counters from SBI PMU"); + __GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS, + "Invalid number of counters %ld\n", ret.value); + + return ret.value; +} + +static void update_counter_info(int num_counters) +{ + int i = 0; + struct sbiret ret; + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo_arr[i].value = ret.value; + counter_mask_available |= BIT(i); + } + + GUEST_ASSERT(counter_mask_available > 0); +} + +static unsigned long read_fw_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ, idx, 0, 0, 0, 0, 0); + GUEST_ASSERT(ret.error == 0); + return ret.value; +} + +static unsigned long read_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + unsigned long counter_val = 0; + + __GUEST_ASSERT(ctrinfo.type < 2, "Invalid counter type %d", ctrinfo.type); + + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) + counter_val = pmu_csr_read_num(ctrinfo.csr); + else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) + counter_val = read_fw_counter(idx, ctrinfo); + + return counter_val; +} + +static inline void verify_sbi_requirement_assert(void) +{ + long out_val = 0; + bool probe; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + if (get_host_sbi_spec_version() < sbi_mk_version(2, 0)) + __GUEST_ASSERT(0, "SBI implementation version doesn't support PMU Snapshot"); +} + +static void snapshot_set_shmem(vm_paddr_t gpa, unsigned long flags) +{ + unsigned long lo = (unsigned long)gpa; +#if __riscv_xlen == 32 + unsigned long hi = (unsigned long)(gpa >> 32); +#else + unsigned long hi = gpa == -1 ? -1 : 0; +#endif + struct sbiret ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, + lo, hi, flags, 0, 0, 0); + + GUEST_ASSERT(ret.value == 0 && ret.error == 0); +} + +static void test_pmu_event(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. + */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, ULONG_MAX/2); + stop_counter(counter, 0); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + stop_counter(counter, 0); + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_pmu_event_snapshot(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + /* The counter value is updated w.r.t relative index of cbase */ + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. + */ + WRITE_ONCE(snapshot_data->ctr_values[0], ULONG_MAX/2); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_pre = READ_ONCE(snapshot_data->ctr_values[0]); + + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_pmu_event_overflow(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_post; + unsigned long counter_init_value = ULONG_MAX - 10000; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_in_use = counter; + + /* The counter value is updated w.r.t relative index of cbase passed to start/stop */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + udelay(msecs_to_usecs(2000)); + /* irq handler should have stopped the counter */ + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + /* The counter value after stopping should be less the init value due to overflow */ + __GUEST_ASSERT(counter_value_post < counter_init_value, + "counter_value_post %lx counter_init_value %lx for counter\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_invalid_event(void) +{ + struct sbiret ret; + unsigned long event = 0x1234; /* A random event */ + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, 0, + counter_mask_available, 0, event, 0, 0); + GUEST_ASSERT_EQ(ret.error, SBI_ERR_NOT_SUPPORTED); +} + +static void test_pmu_events(void) +{ + int num_counters = 0; + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Sanity testing for any random invalid event */ + test_invalid_event(); + + /* Only these two events are guaranteed to be present */ + test_pmu_event(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + +static void test_pmu_basic_sanity(void) +{ + long out_val = 0; + bool probe; + struct sbiret ret; + int num_counters = 0, i; + union sbi_pmu_ctr_info ctrinfo; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + num_counters = get_num_counters(); + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, + 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo.value = ret.value; + + /** + * Accessibility check of hardware and read capability of firmware counters. + * The spec doesn't mandate any initial value. No need to check any value. + */ + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) { + pmu_csr_read_num(ctrinfo.csr); + GUEST_ASSERT(illegal_handler_invoked); + } else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) { + read_fw_counter(i, ctrinfo); + } + } + + GUEST_DONE(); +} + +static void test_pmu_events_snaphost(void) +{ + int num_counters = 0; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + int i; + + /* Verify presence of SBI PMU and minimum requrired SBI version */ + verify_sbi_requirement_assert(); + + snapshot_set_shmem(snapshot_gpa, 0); + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Validate shared memory access */ + GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_overflow_mask), 0); + for (i = 0; i < num_counters; i++) { + if (counter_mask_available & (BIT(i))) + GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_values[i]), 0); + } + /* Only these two events are guranteed to be present */ + test_pmu_event_snapshot(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event_snapshot(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + +static void test_pmu_events_overflow(void) +{ + int num_counters = 0; + + /* Verify presence of SBI PMU and minimum requrired SBI version */ + verify_sbi_requirement_assert(); + + snapshot_set_shmem(snapshot_gpa, 0); + csr_set(CSR_IE, BIT(IRQ_PMU_OVF)); + local_irq_enable(); + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* + * Qemu supports overflow for cycle/instruction. + * This test may fail on any platform that do not support overflow for these two events. + */ + test_pmu_event_overflow(SBI_PMU_HW_CPU_CYCLES); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, 1); + + test_pmu_event_overflow(SBI_PMU_HW_INSTRUCTIONS); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, 2); + + GUEST_DONE(); +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + case UCALL_SYNC: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + break; + } +} + +void test_vm_destroy(struct kvm_vm *vm) +{ + memset(ctrinfo_arr, 0, sizeof(union sbi_pmu_ctr_info) * RISCV_MAX_PMU_COUNTERS); + counter_mask_available = 0; + kvm_vm_free(vm); +} + +static void test_vm_basic_test(void *guest_code) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + vm_init_vector_tables(vm); + /* Illegal instruction handler is required to verify read access without configuration */ + vm_install_exception_handler(vm, EXC_INST_ILLEGAL, guest_illegal_exception_handler); + + vcpu_init_vector_tables(vcpu); + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_events_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu = NULL; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_setup_snapshot_mem(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + /* PMU Snapshot requires single page only */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, PMU_SNAPSHOT_GPA_BASE, 1, 1, 0); + /* PMU_SNAPSHOT_GPA_BASE is identity mapped */ + virt_map(vm, PMU_SNAPSHOT_GPA_BASE, PMU_SNAPSHOT_GPA_BASE, 1); + + snapshot_gva = (void *)(PMU_SNAPSHOT_GPA_BASE); + snapshot_gpa = addr_gva2gpa(vcpu->vm, (vm_vaddr_t)snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gpa); +} + +static void test_vm_events_snapshot_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_events_overflow(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpu, KVM_RISCV_ISA_EXT_SSCOFPMF), + "Sscofpmf is not available, skipping overflow test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + vm_init_vector_tables(vm); + vm_install_interrupt_handler(vm, guest_irq_handler); + + vcpu_init_vector_tables(vcpu); + /* Initialize guest timer frequency. */ + vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency), &timer_freq); + sync_global_to_guest(vm, timer_freq); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-d <test name>]\n", name); + pr_info("\t-d: Test to disable. Available tests are 'basic', 'events', 'snapshot', 'overflow'\n"); + pr_info("\t-h: print this help screen\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "hd:")) != -1) { + switch (opt) { + case 'd': + if (!strncmp("basic", optarg, 5)) + disabled_tests |= SBI_PMU_TEST_BASIC; + else if (!strncmp("events", optarg, 6)) + disabled_tests |= SBI_PMU_TEST_EVENTS; + else if (!strncmp("snapshot", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_SNAPSHOT; + else if (!strncmp("overflow", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_OVERFLOW; + else + goto done; + break; + case 'h': + default: + goto done; + } + } + + return true; +done: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) +{ + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + if (!(disabled_tests & SBI_PMU_TEST_BASIC)) { + test_vm_basic_test(test_pmu_basic_sanity); + pr_info("SBI PMU basic test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_EVENTS)) { + test_vm_events_test(test_pmu_events); + pr_info("SBI PMU event verification test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_SNAPSHOT)) { + test_vm_events_snapshot_test(test_pmu_events_snaphost); + pr_info("SBI PMU event verification with snapshot test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_OVERFLOW)) { + test_vm_events_overflow(test_pmu_events_overflow); + pr_info("SBI PMU event verification with overflow test : PASS\n"); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index bd57d991e27d..68c899d27561 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -339,7 +339,7 @@ static void test_invalid_memory_region_flags(void) #ifdef __x86_64__ if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); else #endif vm = vm_create_barebones(); @@ -462,7 +462,7 @@ static void test_add_private_memory_region(void) pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n"); - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); @@ -471,7 +471,7 @@ static void test_add_private_memory_region(void) test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail"); close(memfd); - vm2 = vm_create_barebones_protected_vm(); + vm2 = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0); test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail"); @@ -499,7 +499,7 @@ static void test_add_overlapping_private_memory_regions(void) pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n"); - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0); diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index bae0c5026f82..2ff82c7fd926 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -11,7 +11,9 @@ #include <pthread.h> #include <linux/kernel.h> #include <asm/kvm.h> -#ifndef __riscv +#ifdef __riscv +#include "sbi.h" +#else #include <asm/kvm_para.h> #endif diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c new file mode 100644 index 000000000000..7a4a61be119b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm.h> +#include <linux/psp-sev.h> +#include <stdio.h> +#include <sys/ioctl.h> +#include <stdlib.h> +#include <errno.h> +#include <pthread.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "kselftest.h" + +#define SVM_SEV_FEAT_DEBUG_SWAP 32u + +/* + * Some features may have hidden dependencies, or may only work + * for certain VM types. Err on the side of safety and don't + * expect that all supported features can be passed one by one + * to KVM_SEV_INIT2. + * + * (Well, right now there's only one...) + */ +#define KNOWN_FEATURES SVM_SEV_FEAT_DEBUG_SWAP + +int kvm_fd; +u64 supported_vmsa_features; +bool have_sev_es; + +static int __sev_ioctl(int vm_fd, int cmd_id, void *data) +{ + struct kvm_sev_cmd cmd = { + .id = cmd_id, + .data = (uint64_t)data, + .sev_fd = open_sev_dev_path_or_exit(), + }; + int ret; + + ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd); + TEST_ASSERT(ret < 0 || cmd.error == SEV_RET_SUCCESS, + "%d failed: fw error: %d\n", + cmd_id, cmd.error); + + return ret; +} + +static void test_init2(unsigned long vm_type, struct kvm_sev_init *init) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == 0, + "KVM_SEV_INIT2 return code is %d (expected 0), errno: %d", + ret, errno); + kvm_vm_free(vm); +} + +static void test_init2_invalid(unsigned long vm_type, struct kvm_sev_init *init, const char *msg) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "KVM_SEV_INIT2 should fail, %s.", + msg); + kvm_vm_free(vm); +} + +void test_vm_types(void) +{ + test_init2(KVM_X86_SEV_VM, &(struct kvm_sev_init){}); + + /* + * TODO: check that unsupported types cannot be created. Probably + * a separate selftest. + */ + if (have_sev_es) + test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){}); + + test_init2_invalid(0, &(struct kvm_sev_init){}, + "VM type is KVM_X86_DEFAULT_VM"); + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + test_init2_invalid(KVM_X86_SW_PROTECTED_VM, &(struct kvm_sev_init){}, + "VM type is KVM_X86_SW_PROTECTED_VM"); +} + +void test_flags(uint32_t vm_type) +{ + int i; + + for (i = 0; i < 32; i++) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .flags = BIT(i) }, + "invalid flag"); +} + +void test_features(uint32_t vm_type, uint64_t supported_features) +{ + int i; + + for (i = 0; i < 64; i++) { + if (!(supported_features & (1u << i))) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, + "unknown feature"); + else if (KNOWN_FEATURES & (1u << i)) + test_init2(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); + } +} + +int main(int argc, char *argv[]) +{ + int kvm_fd = open_kvm_dev_path_or_exit(); + bool have_sev; + + TEST_REQUIRE(__kvm_has_device_attr(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES) == 0); + kvm_device_attr_get(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES, + &supported_vmsa_features); + + have_sev = kvm_cpu_has(X86_FEATURE_SEV); + TEST_ASSERT(have_sev == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)), + "sev: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM); + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)); + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + + TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)), + "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); + + test_vm_types(); + + test_flags(KVM_X86_SEV_VM); + if (have_sev_es) + test_flags(KVM_X86_SEV_ES_VM); + + test_features(KVM_X86_SEV_VM, 0); + if (have_sev_es) + test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c index 026779f3ed06..7c70c0da4fb7 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c @@ -4,6 +4,7 @@ #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> +#include <math.h> #include "test_util.h" #include "kvm_util.h" @@ -13,6 +14,8 @@ #include "sev.h" +#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM) + static void guest_sev_es_code(void) { /* TODO: Check CPUID after GHCB-based hypercall support is added. */ @@ -35,13 +38,98 @@ static void guest_sev_code(void) GUEST_DONE(); } +/* Stash state passed via VMSA before any compiled code runs. */ +extern void guest_code_xsave(void); +asm("guest_code_xsave:\n" + "mov $-1, %eax\n" + "mov $-1, %edx\n" + "xsave (%rdi)\n" + "jmp guest_sev_es_code"); + +static void compare_xsave(u8 *from_host, u8 *from_guest) +{ + int i; + bool bad = false; + for (i = 0; i < 4095; i++) { + if (from_host[i] != from_guest[i]) { + printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); + bad = true; + } + } + + if (bad) + abort(); +} + +static void test_sync_vmsa(uint32_t policy) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t gva; + void *hva; + + double x87val = M_PI; + struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 }; + struct kvm_sregs sregs; + struct kvm_xcrs xcrs = { + .nr_xcrs = 1, + .xcrs[0].xcr = 0, + .xcrs[0].value = XFEATURE_MASK_X87_AVX, + }; + + vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu); + gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR, + MEM_REGION_TEST_DATA); + hva = addr_gva2hva(vm, gva); + + vcpu_args_set(vcpu, 1, gva); + + vcpu_sregs_get(vcpu, &sregs); + sregs.cr4 |= X86_CR4_OSFXSR | X86_CR4_OSXSAVE; + vcpu_sregs_set(vcpu, &sregs); + + vcpu_xcrs_set(vcpu, &xcrs); + asm("fninit\n" + "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n" + "fldl %3\n" + "xsave (%2)\n" + "fstp %%st\n" + : "=m"(xsave) + : "A"(XFEATURE_MASK_X87_AVX), "r"(&xsave), "m" (x87val) + : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); + vcpu_xsave_set(vcpu, &xsave); + + vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL); + + /* This page is shared, so make it decrypted. */ + memset(hva, 0, 4096); + + vcpu_run(vcpu); + + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, + "Wanted SYSTEM_EVENT, got %s", + exit_reason_str(vcpu->run->exit_reason)); + TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); + TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); + TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); + + compare_xsave((u8 *)&xsave, (u8 *)hva); + + kvm_vm_free(vm); +} + static void test_sev(void *guest_code, uint64_t policy) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - vm = vm_sev_create_with_one_vcpu(policy, guest_code, &vcpu); + uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu); + + /* TODO: Validate the measurement is as expected. */ + vm_sev_launch(vm, policy, NULL); for (;;) { vcpu_run(vcpu); @@ -82,6 +170,12 @@ int main(int argc, char *argv[]) if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); test_sev(guest_sev_es_code, SEV_POLICY_ES); + + if (kvm_has_cap(KVM_CAP_XCRS) && + (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) { + test_sync_vmsa(0); + test_sync_vmsa(SEV_POLICY_NO_DBG); + } } return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ff0a20565f90..38694f95bb56 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -401,12 +401,17 @@ static void kvm_flush_shadow_all(struct kvm *kvm) static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, gfp_t gfp_flags) { + void *page; + gfp_flags |= mc->gfp_zero; if (mc->kmem_cache) return kmem_cache_alloc(mc->kmem_cache, gfp_flags); - else - return (void *)__get_free_page(gfp_flags); + + page = (void *)__get_free_page(gfp_flags); + if (page && mc->init_value) + memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64)); + return page; } int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min) @@ -421,6 +426,13 @@ int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, if (WARN_ON_ONCE(!capacity)) return -EIO; + /* + * Custom init values can be used only for page allocations, + * and obviously conflict with __GFP_ZERO. + */ + if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero))) + return -EIO; + mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp); if (!mc->objects) return -ENOMEM; @@ -583,8 +595,6 @@ static void kvm_null_fn(void) } #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) -static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG; - /* Iterate over each memslot intersecting [start, last] (inclusive) range */ #define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \ for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ @@ -670,14 +680,12 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, - union kvm_mmu_notifier_arg arg, gfn_handler_t handler) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range range = { .start = start, .end = end, - .arg = arg, .handler = handler, .on_lock = (void *)kvm_null_fn, .flush_on_ret = true, @@ -705,48 +713,6 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn return __kvm_handle_hva_range(kvm, &range).ret; } -static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - /* - * Skipping invalid memslots is correct if and only change_pte() is - * surrounded by invalidate_range_{start,end}(), which is currently - * guaranteed by the primary MMU. If that ever changes, KVM needs to - * unmap the memslot instead of skipping the memslot to ensure that KVM - * doesn't hold references to the old PFN. - */ - WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); - - if (range->slot->flags & KVM_MEMSLOT_INVALID) - return false; - - return kvm_set_spte_gfn(kvm, range); -} - -static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) -{ - struct kvm *kvm = mmu_notifier_to_kvm(mn); - const union kvm_mmu_notifier_arg arg = { .pte = pte }; - - trace_kvm_set_spte_hva(address); - - /* - * .change_pte() must be surrounded by .invalidate_range_{start,end}(). - * If mmu_invalidate_in_progress is zero, then no in-progress - * invalidations, including this one, found a relevant memslot at - * start(); rechecking memslots here is unnecessary. Note, a false - * positive (count elevated by a different invalidation) is sub-optimal - * but functionally ok. - */ - WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); - if (!READ_ONCE(kvm->mmu_invalidate_in_progress)) - return; - - kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn); -} - void kvm_mmu_invalidate_begin(struct kvm *kvm) { lockdep_assert_held_write(&kvm->mmu_lock); @@ -909,8 +875,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, { trace_kvm_age_hva(start, end); - return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG, - kvm_age_gfn); + return kvm_handle_hva_range(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -963,7 +928,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .clear_flush_young = kvm_mmu_notifier_clear_flush_young, .clear_young = kvm_mmu_notifier_clear_young, .test_young = kvm_mmu_notifier_test_young, - .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; |