aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/api.rst2
-rw-r--r--Documentation/virt/kvm/x86/amd-memory-encryption.rst59
-rw-r--r--arch/arm64/kvm/mmu.c34
-rw-r--r--arch/loongarch/include/asm/kvm_host.h1
-rw-r--r--arch/loongarch/kvm/mmu.c32
-rw-r--r--arch/mips/kvm/mmu.c30
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h1
-rw-r--r--arch/powerpc/kvm/book3s.c5
-rw-r--r--arch/powerpc/kvm/book3s.h1
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c12
-rw-r--r--arch/powerpc/kvm/book3s_hv.c1
-rw-r--r--arch/powerpc/kvm/book3s_pr.c7
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c6
-rw-r--r--arch/riscv/include/asm/csr.h5
-rw-r--r--arch/riscv/include/asm/kvm_host.h21
-rw-r--r--arch/riscv/include/asm/kvm_vcpu_pmu.h16
-rw-r--r--arch/riscv/include/asm/sbi.h38
-rw-r--r--arch/riscv/include/uapi/asm/kvm.h1
-rw-r--r--arch/riscv/kernel/paravirt.c6
-rw-r--r--arch/riscv/kvm/aia.c5
-rw-r--r--arch/riscv/kvm/main.c18
-rw-r--r--arch/riscv/kvm/mmu.c20
-rw-r--r--arch/riscv/kvm/vcpu.c85
-rw-r--r--arch/riscv/kvm/vcpu_exit.c4
-rw-r--r--arch/riscv/kvm/vcpu_onereg.c6
-rw-r--r--arch/riscv/kvm/vcpu_pmu.c260
-rw-r--r--arch/riscv/kvm/vcpu_sbi.c7
-rw-r--r--arch/riscv/kvm/vcpu_sbi_hsm.c42
-rw-r--r--arch/riscv/kvm/vcpu_sbi_pmu.c17
-rw-r--r--arch/riscv/kvm/vcpu_sbi_sta.c4
-rw-r--r--arch/riscv/kvm/vm.c1
-rw-r--r--arch/x86/include/asm/fpu/api.h3
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h60
-rw-r--r--arch/x86/include/asm/sev-common.h8
-rw-r--r--arch/x86/include/asm/vmx.h13
-rw-r--r--arch/x86/include/uapi/asm/kvm.h22
-rw-r--r--arch/x86/kernel/fpu/xstate.c1
-rw-r--r--arch/x86/kernel/fpu/xstate.h2
-rw-r--r--arch/x86/kvm/Kconfig13
-rw-r--r--arch/x86/kvm/Makefile9
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/mmu/mmu.c270
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h28
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h2
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h14
-rw-r--r--arch/x86/kvm/mmu/spte.c40
-rw-r--r--arch/x86/kvm/mmu/spte.h26
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c64
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h1
-rw-r--r--arch/x86/kvm/svm/sev.c343
-rw-r--r--arch/x86/kvm/svm/svm.c36
-rw-r--r--arch/x86/kvm/svm/svm.h56
-rw-r--r--arch/x86/kvm/vmx/main.c166
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmx.c438
-rw-r--r--arch/x86/kvm/vmx/vmx.h6
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h124
-rw-r--r--arch/x86/kvm/x86.c234
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--drivers/perf/riscv_pmu.c3
-rw-r--r--drivers/perf/riscv_pmu_sbi.c316
-rw-r--r--include/linux/kvm_host.h2
-rw-r--r--include/linux/kvm_types.h1
-rw-r--r--include/linux/mmu_notifier.h44
-rw-r--r--include/linux/perf/riscv_pmu.h8
-rw-r--r--include/trace/events/kvm.h15
-rw-r--r--kernel/events/uprobes.c6
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/memory.c7
-rw-r--r--mm/migrate_device.c8
-rw-r--r--mm/mmu_notifier.c17
-rw-r--r--tools/testing/selftests/kvm/Makefile3
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util_base.h11
-rw-r--r--tools/testing/selftests/kvm/include/riscv/processor.h49
-rw-r--r--tools/testing/selftests/kvm/include/riscv/sbi.h141
-rw-r--r--tools/testing/selftests/kvm/include/riscv/ucall.h1
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/processor.h6
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/sev.h19
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c1
-rw-r--r--tools/testing/selftests/kvm/lib/riscv/processor.c12
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c14
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/sev.c44
-rw-r--r--tools/testing/selftests/kvm/riscv/arch_timer.c2
-rw-r--r--tools/testing/selftests/kvm/riscv/ebreak_test.c82
-rw-r--r--tools/testing/selftests/kvm/riscv/get-reg-list.c4
-rw-r--r--tools/testing/selftests/kvm/riscv/sbi_pmu_test.c681
-rw-r--r--tools/testing/selftests/kvm/set_memory_region_test.c8
-rw-r--r--tools/testing/selftests/kvm/steal_time.c4
-rw-r--r--tools/testing/selftests/kvm/x86_64/sev_init2_tests.c152
-rw-r--r--tools/testing/selftests/kvm/x86_64/sev_smoke_test.c96
-rw-r--r--virt/kvm/kvm_main.c66
93 files changed, 3322 insertions, 1246 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 0b5a33ee71ee..f0b76ff5030d 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8819,6 +8819,8 @@ means the VM type with value @n is supported. Possible values of @n are::
#define KVM_X86_DEFAULT_VM 0
#define KVM_X86_SW_PROTECTED_VM 1
+ #define KVM_X86_SEV_VM 2
+ #define KVM_X86_SEV_ES_VM 3
Note, KVM_X86_SW_PROTECTED_VM is currently only for development and testing.
Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in
diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
index 84335d119ff1..9677a0714a39 100644
--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
@@ -76,15 +76,56 @@ are defined in ``<linux/psp-dev.h>``.
KVM implements the following commands to support common lifecycle events of SEV
guests, such as launching, running, snapshotting, migrating and decommissioning.
-1. KVM_SEV_INIT
----------------
+1. KVM_SEV_INIT2
+----------------
-The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform
+The KVM_SEV_INIT2 command is used by the hypervisor to initialize the SEV platform
context. In a typical workflow, this command should be the first command issued.
+For this command to be accepted, either KVM_X86_SEV_VM or KVM_X86_SEV_ES_VM
+must have been passed to the KVM_CREATE_VM ioctl. A virtual machine created
+with those machine types in turn cannot be run until KVM_SEV_INIT2 is invoked.
+
+Parameters: struct kvm_sev_init (in)
Returns: 0 on success, -negative on error
+::
+
+ struct kvm_sev_init {
+ __u64 vmsa_features; /* initial value of features field in VMSA */
+ __u32 flags; /* must be 0 */
+ __u16 ghcb_version; /* maximum guest GHCB version allowed */
+ __u16 pad1;
+ __u32 pad2[8];
+ };
+
+It is an error if the hypervisor does not support any of the bits that
+are set in ``flags`` or ``vmsa_features``. ``vmsa_features`` must be
+0 for SEV virtual machines, as they do not have a VMSA.
+
+``ghcb_version`` must be 0 for SEV virtual machines, as they do not issue GHCB
+requests. If ``ghcb_version`` is 0 for any other guest type, then the maximum
+allowed guest GHCB protocol will default to version 2.
+
+This command replaces the deprecated KVM_SEV_INIT and KVM_SEV_ES_INIT commands.
+The commands did not have any parameters (the ```data``` field was unused) and
+only work for the KVM_X86_DEFAULT_VM machine type (0).
+
+They behave as if:
+
+* the VM type is KVM_X86_SEV_VM for KVM_SEV_INIT, or KVM_X86_SEV_ES_VM for
+ KVM_SEV_ES_INIT
+
+* the ``flags`` and ``vmsa_features`` fields of ``struct kvm_sev_init`` are
+ set to zero, and ``ghcb_version`` is set to 0 for KVM_SEV_INIT and 1 for
+ KVM_SEV_ES_INIT.
+
+If the ``KVM_X86_SEV_VMSA_FEATURES`` attribute does not exist, the hypervisor only
+supports KVM_SEV_INIT and KVM_SEV_ES_INIT. In that case, note that KVM_SEV_ES_INIT
+might set the debug swap VMSA feature (bit 5) depending on the value of the
+``debug_swap`` parameter of ``kvm-amd.ko``.
+
2. KVM_SEV_LAUNCH_START
-----------------------
@@ -425,6 +466,18 @@ issued by the hypervisor to make the guest ready for execution.
Returns: 0 on success, -negative on error
+Device attribute API
+====================
+
+Attributes of the SEV implementation can be retrieved through the
+``KVM_HAS_DEVICE_ATTR`` and ``KVM_GET_DEVICE_ATTR`` ioctls on the ``/dev/kvm``
+device node, using group ``KVM_X86_GRP_SEV``.
+
+Currently only one attribute is implemented:
+
+* ``KVM_X86_SEV_VMSA_FEATURES``: return the set of all bits that
+ are accepted in the ``vmsa_features`` of ``KVM_SEV_INIT2``.
+
Firmware Management
===================
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index dc04bc767865..ff17849be9f4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1768,40 +1768,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
return false;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- kvm_pfn_t pfn = pte_pfn(range->arg.pte);
-
- if (!kvm->arch.mmu.pgt)
- return false;
-
- WARN_ON(range->end - range->start != 1);
-
- /*
- * If the page isn't tagged, defer to user_mem_abort() for sanitising
- * the MTE tags. The S2 pte should have been unmapped by
- * mmu_notifier_invalidate_range_end().
- */
- if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
- return false;
-
- /*
- * We've moved a page around, probably through CoW, so let's treat
- * it just like a translation fault and the map handler will clean
- * the cache to the PoC.
- *
- * The MMU notifiers will have unmapped a huge PMD before calling
- * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
- * therefore we never need to clear out a huge PMD through this
- * calling path and a memcache is not required.
- */
- kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
- PAGE_SIZE, __pfn_to_phys(pfn),
- KVM_PGTABLE_PROT_R, NULL, 0);
-
- return false;
-}
-
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index fc2edaefd688..c87b6ea0ec47 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -236,7 +236,6 @@ void kvm_flush_tlb_all(void);
void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa);
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write);
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c
index a556cff35740..98883aa23ab8 100644
--- a/arch/loongarch/kvm/mmu.c
+++ b/arch/loongarch/kvm/mmu.c
@@ -494,38 +494,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
range->end << PAGE_SHIFT, &ctx);
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- unsigned long prot_bits;
- kvm_pte_t *ptep;
- kvm_pfn_t pfn = pte_pfn(range->arg.pte);
- gpa_t gpa = range->start << PAGE_SHIFT;
-
- ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
- if (!ptep)
- return false;
-
- /* Replacing an absent or old page doesn't need flushes */
- if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) {
- kvm_set_pte(ptep, 0);
- return false;
- }
-
- /* Fill new pte if write protected or page migrated */
- prot_bits = _PAGE_PRESENT | __READABLE;
- prot_bits |= _CACHE_MASK & pte_val(range->arg.pte);
-
- /*
- * Set _PAGE_WRITE or _PAGE_DIRTY iff old and new pte both support
- * _PAGE_WRITE for map_page_fast if next page write fault
- * _PAGE_DIRTY since gpa has already recorded as dirty page
- */
- prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte);
- kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits)));
-
- return true;
-}
-
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
kvm_ptw_ctx ctx;
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 467ee6b95ae1..c17157e700c0 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -444,36 +444,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
return true;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- gpa_t gpa = range->start << PAGE_SHIFT;
- pte_t hva_pte = range->arg.pte;
- pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
- pte_t old_pte;
-
- if (!gpa_pte)
- return false;
-
- /* Mapping may need adjusting depending on memslot flags */
- old_pte = *gpa_pte;
- if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
- hva_pte = pte_mkclean(hva_pte);
- else if (range->slot->flags & KVM_MEM_READONLY)
- hva_pte = pte_wrprotect(hva_pte);
-
- set_pte(gpa_pte, hva_pte);
-
- /* Replacing an absent or old page doesn't need flushes */
- if (!pte_present(old_pte) || !pte_young(old_pte))
- return false;
-
- /* Pages swapped, aged, moved, or cleaned require flushes */
- return !pte_present(hva_pte) ||
- !pte_young(hva_pte) ||
- pte_pfn(old_pte) != pte_pfn(hva_pte) ||
- (pte_dirty(old_pte) && !pte_dirty(hva_pte));
-}
-
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end);
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 3281215097cc..ca3829d47ab7 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -287,7 +287,6 @@ struct kvmppc_ops {
bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
- bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
void (*free_memslot)(struct kvm_memory_slot *slot);
int (*init_vm)(struct kvm *kvm);
void (*destroy_vm)(struct kvm *kvm);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 8acec144120e..0d0624088e6b 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -899,11 +899,6 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
-}
-
int kvmppc_core_init_vm(struct kvm *kvm)
{
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 58391b4b32ed..4aa2ab89afbc 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -12,7 +12,6 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
-extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 2b1f0cdd8c18..1b51b1c4713b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1010,18 +1010,6 @@ bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
return kvm_test_age_rmapp(kvm, range->slot, range->start);
}
-bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- WARN_ON(range->start + 1 != range->end);
-
- if (kvm_is_radix(kvm))
- kvm_unmap_radix(kvm, range->slot, range->start);
- else
- kvm_unmap_rmapp(kvm, range->slot, range->start);
-
- return false;
-}
-
static int vcpus_running(struct kvm *kvm)
{
return atomic_read(&kvm->arch.vcpus_running) != 0;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8e86eb577eb8..35cb014a0c51 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -6364,7 +6364,6 @@ static struct kvmppc_ops kvm_ops_hv = {
.unmap_gfn_range = kvm_unmap_gfn_range_hv,
.age_gfn = kvm_age_gfn_hv,
.test_age_gfn = kvm_test_age_gfn_hv,
- .set_spte_gfn = kvm_set_spte_gfn_hv,
.free_memslot = kvmppc_core_free_memslot_hv,
.init_vm = kvmppc_core_init_vm_hv,
.destroy_vm = kvmppc_core_destroy_vm_hv,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 5b92619a05fd..a7d7137ea0c8 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -461,12 +461,6 @@ static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
return false;
}
-static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- /* The page will get remapped properly on its next fault */
- return do_kvm_unmap_gfn(kvm, range);
-}
-
/*****************************************/
static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
@@ -2071,7 +2065,6 @@ static struct kvmppc_ops kvm_ops_pr = {
.unmap_gfn_range = kvm_unmap_gfn_range_pr,
.age_gfn = kvm_age_gfn_pr,
.test_age_gfn = kvm_test_age_gfn_pr,
- .set_spte_gfn = kvm_set_spte_gfn_pr,
.free_memslot = kvmppc_core_free_memslot_pr,
.init_vm = kvmppc_core_init_vm_pr,
.destroy_vm = kvmppc_core_destroy_vm_pr,
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index ccb8f16ffe41..c664fdec75b1 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -747,12 +747,6 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return false;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- /* The page will get remapped properly on its next fault */
- return kvm_e500_mmu_unmap_gfn(kvm, range);
-}
-
/*****************************************/
int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500)
diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index 2468c55933cd..25966995da04 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -168,7 +168,8 @@
#define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT)
#define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \
(_AC(1, UL) << IRQ_S_TIMER) | \
- (_AC(1, UL) << IRQ_S_EXT))
+ (_AC(1, UL) << IRQ_S_EXT) | \
+ (_AC(1, UL) << IRQ_PMU_OVF))
/* AIA CSR bits */
#define TOPI_IID_SHIFT 16
@@ -281,7 +282,7 @@
#define CSR_HPMCOUNTER30H 0xc9e
#define CSR_HPMCOUNTER31H 0xc9f
-#define CSR_SSCOUNTOVF 0xda0
+#define CSR_SCOUNTOVF 0xda0
#define CSR_SSTATUS 0x100
#define CSR_SIE 0x104
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 484d04a92fa6..d96281278586 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -43,6 +43,17 @@
KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(6)
+#define KVM_HEDELEG_DEFAULT (BIT(EXC_INST_MISALIGNED) | \
+ BIT(EXC_BREAKPOINT) | \
+ BIT(EXC_SYSCALL) | \
+ BIT(EXC_INST_PAGE_FAULT) | \
+ BIT(EXC_LOAD_PAGE_FAULT) | \
+ BIT(EXC_STORE_PAGE_FAULT))
+
+#define KVM_HIDELEG_DEFAULT (BIT(IRQ_VS_SOFT) | \
+ BIT(IRQ_VS_TIMER) | \
+ BIT(IRQ_VS_EXT))
+
enum kvm_riscv_hfence_type {
KVM_RISCV_HFENCE_UNKNOWN = 0,
KVM_RISCV_HFENCE_GVMA_VMID_GPA,
@@ -169,6 +180,7 @@ struct kvm_vcpu_csr {
struct kvm_vcpu_config {
u64 henvcfg;
u64 hstateen0;
+ unsigned long hedeleg;
};
struct kvm_vcpu_smstateen_csr {
@@ -211,6 +223,7 @@ struct kvm_vcpu_arch {
/* CPU context upon Guest VCPU reset */
struct kvm_cpu_context guest_reset_context;
+ spinlock_t reset_cntx_lock;
/* CPU CSR context upon Guest VCPU reset */
struct kvm_vcpu_csr guest_reset_csr;
@@ -252,8 +265,9 @@ struct kvm_vcpu_arch {
/* Cache pages needed to program page tables with spinlock held */
struct kvm_mmu_memory_cache mmu_page_cache;
- /* VCPU power-off state */
- bool power_off;
+ /* VCPU power state */
+ struct kvm_mp_state mp_state;
+ spinlock_t mp_state_lock;
/* Don't run the VCPU (blocked) */
bool pause;
@@ -374,8 +388,11 @@ int kvm_riscv_vcpu_unset_interrupt(struct kvm_vcpu *vcpu, unsigned int irq);
void kvm_riscv_vcpu_flush_interrupts(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu);
bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, u64 mask);
+void __kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu);
+void __kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu);
+bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu);
diff --git a/arch/riscv/include/asm/kvm_vcpu_pmu.h b/arch/riscv/include/asm/kvm_vcpu_pmu.h
index 395518a1664e..fa0f535bbbf0 100644
--- a/arch/riscv/include/asm/kvm_vcpu_pmu.h
+++ b/arch/riscv/include/asm/kvm_vcpu_pmu.h
@@ -20,7 +20,7 @@ static_assert(RISCV_KVM_MAX_COUNTERS <= 64);
struct kvm_fw_event {
/* Current value of the event */
- unsigned long value;
+ u64 value;
/* Event monitoring status */
bool started;
@@ -36,6 +36,7 @@ struct kvm_pmc {
bool started;
/* Monitoring event ID */
unsigned long event_idx;
+ struct kvm_vcpu *vcpu;
};
/* PMU data structure per vcpu */
@@ -50,6 +51,12 @@ struct kvm_pmu {
bool init_done;
/* Bit map of all the virtual counter used */
DECLARE_BITMAP(pmc_in_use, RISCV_KVM_MAX_COUNTERS);
+ /* Bit map of all the virtual counter overflown */
+ DECLARE_BITMAP(pmc_overflown, RISCV_KVM_MAX_COUNTERS);
+ /* The address of the counter snapshot area (guest physical address) */
+ gpa_t snapshot_addr;
+ /* The actual data of the snapshot */
+ struct riscv_pmu_snapshot_data *sdata;
};
#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu_context)
@@ -82,9 +89,14 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba
unsigned long ctr_mask, unsigned long flags,
unsigned long eidx, u64 evtdata,
struct kvm_vcpu_sbi_return *retdata);
-int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
+int kvm_riscv_vcpu_pmu_fw_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
struct kvm_vcpu_sbi_return *retdata);
+int kvm_riscv_vcpu_pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
+ struct kvm_vcpu_sbi_return *retdata);
void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu);
+int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long saddr_low,
+ unsigned long saddr_high, unsigned long flags,
+ struct kvm_vcpu_sbi_return *retdata);
void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu);
diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 6e68f8dff76b..112a0a0d9f46 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -131,6 +131,8 @@ enum sbi_ext_pmu_fid {
SBI_EXT_PMU_COUNTER_START,
SBI_EXT_PMU_COUNTER_STOP,
SBI_EXT_PMU_COUNTER_FW_READ,
+ SBI_EXT_PMU_COUNTER_FW_READ_HI,
+ SBI_EXT_PMU_SNAPSHOT_SET_SHMEM,
};
union sbi_pmu_ctr_info {
@@ -147,6 +149,13 @@ union sbi_pmu_ctr_info {
};
};
+/* Data structure to contain the pmu snapshot data */
+struct riscv_pmu_snapshot_data {
+ u64 ctr_overflow_mask;
+ u64 ctr_values[64];
+ u64 reserved[447];
+};
+
#define RISCV_PMU_RAW_EVENT_MASK GENMASK_ULL(47, 0)
#define RISCV_PMU_RAW_EVENT_IDX 0x20000
@@ -232,20 +241,22 @@ enum sbi_pmu_ctr_type {
#define SBI_PMU_EVENT_IDX_INVALID 0xFFFFFFFF
/* Flags defined for config matching function */
-#define SBI_PMU_CFG_FLAG_SKIP_MATCH (1 << 0)
-#define SBI_PMU_CFG_FLAG_CLEAR_VALUE (1 << 1)
-#define SBI_PMU_CFG_FLAG_AUTO_START (1 << 2)
-#define SBI_PMU_CFG_FLAG_SET_VUINH (1 << 3)
-#define SBI_PMU_CFG_FLAG_SET_VSINH (1 << 4)
-#define SBI_PMU_CFG_FLAG_SET_UINH (1 << 5)
-#define SBI_PMU_CFG_FLAG_SET_SINH (1 << 6)
-#define SBI_PMU_CFG_FLAG_SET_MINH (1 << 7)
+#define SBI_PMU_CFG_FLAG_SKIP_MATCH BIT(0)
+#define SBI_PMU_CFG_FLAG_CLEAR_VALUE BIT(1)
+#define SBI_PMU_CFG_FLAG_AUTO_START BIT(2)
+#define SBI_PMU_CFG_FLAG_SET_VUINH BIT(3)
+#define SBI_PMU_CFG_FLAG_SET_VSINH BIT(4)
+#define SBI_PMU_CFG_FLAG_SET_UINH BIT(5)
+#define SBI_PMU_CFG_FLAG_SET_SINH BIT(6)
+#define SBI_PMU_CFG_FLAG_SET_MINH BIT(7)
/* Flags defined for counter start function */
-#define SBI_PMU_START_FLAG_SET_INIT_VALUE (1 << 0)
+#define SBI_PMU_START_FLAG_SET_INIT_VALUE BIT(0)
+#define SBI_PMU_START_FLAG_INIT_SNAPSHOT BIT(1)
/* Flags defined for counter stop function */
-#define SBI_PMU_STOP_FLAG_RESET (1 << 0)
+#define SBI_PMU_STOP_FLAG_RESET BIT(0)
+#define SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT BIT(1)
enum sbi_ext_dbcn_fid {
SBI_EXT_DBCN_CONSOLE_WRITE = 0,
@@ -266,7 +277,7 @@ struct sbi_sta_struct {
u8 pad[47];
} __packed;
-#define SBI_STA_SHMEM_DISABLE -1
+#define SBI_SHMEM_DISABLE -1
/* SBI spec version fields */
#define SBI_SPEC_VERSION_DEFAULT 0x1
@@ -284,6 +295,7 @@ struct sbi_sta_struct {
#define SBI_ERR_ALREADY_AVAILABLE -6
#define SBI_ERR_ALREADY_STARTED -7
#define SBI_ERR_ALREADY_STOPPED -8
+#define SBI_ERR_NO_SHMEM -9
extern unsigned long sbi_spec_version;
struct sbiret {
@@ -355,8 +367,8 @@ static inline unsigned long sbi_minor_version(void)
static inline unsigned long sbi_mk_version(unsigned long major,
unsigned long minor)
{
- return ((major & SBI_SPEC_VERSION_MAJOR_MASK) <<
- SBI_SPEC_VERSION_MAJOR_SHIFT) | minor;
+ return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << SBI_SPEC_VERSION_MAJOR_SHIFT)
+ | (minor & SBI_SPEC_VERSION_MINOR_MASK);
}
int sbi_err_map_linux_errno(int err);
diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index b1c503c2959c..e878e7cc3978 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -167,6 +167,7 @@ enum KVM_RISCV_ISA_EXT_ID {
KVM_RISCV_ISA_EXT_ZFA,
KVM_RISCV_ISA_EXT_ZTSO,
KVM_RISCV_ISA_EXT_ZACAS,
+ KVM_RISCV_ISA_EXT_SSCOFPMF,
KVM_RISCV_ISA_EXT_MAX,
};
diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c
index 0d6225fd3194..fa6b0339a65d 100644
--- a/arch/riscv/kernel/paravirt.c
+++ b/arch/riscv/kernel/paravirt.c
@@ -62,7 +62,7 @@ static int sbi_sta_steal_time_set_shmem(unsigned long lo, unsigned long hi,
ret = sbi_ecall(SBI_EXT_STA, SBI_EXT_STA_STEAL_TIME_SET_SHMEM,
lo, hi, flags, 0, 0, 0);
if (ret.error) {
- if (lo == SBI_STA_SHMEM_DISABLE && hi == SBI_STA_SHMEM_DISABLE)
+ if (lo == SBI_SHMEM_DISABLE && hi == SBI_SHMEM_DISABLE)
pr_warn("Failed to disable steal-time shmem");
else
pr_warn("Failed to set steal-time shmem");
@@ -84,8 +84,8 @@ static int pv_time_cpu_online(unsigned int cpu)
static int pv_time_cpu_down_prepare(unsigned int cpu)
{
- return sbi_sta_steal_time_set_shmem(SBI_STA_SHMEM_DISABLE,
- SBI_STA_SHMEM_DISABLE, 0);
+ return sbi_sta_steal_time_set_shmem(SBI_SHMEM_DISABLE,
+ SBI_SHMEM_DISABLE, 0);
}
static u64 pv_time_steal_clock(int cpu)
diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c
index a944294f6f23..0f0a9d11bb5f 100644
--- a/arch/riscv/kvm/aia.c
+++ b/arch/riscv/kvm/aia.c
@@ -545,6 +545,9 @@ void kvm_riscv_aia_enable(void)
enable_percpu_irq(hgei_parent_irq,
irq_get_trigger_type(hgei_parent_irq));
csr_set(CSR_HIE, BIT(IRQ_S_GEXT));
+ /* Enable IRQ filtering for overflow interrupt only if sscofpmf is present */
+ if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF))
+ csr_write(CSR_HVIEN, BIT(IRQ_PMU_OVF));
}
void kvm_riscv_aia_disable(void)
@@ -558,6 +561,8 @@ void kvm_riscv_aia_disable(void)
return;
hgctrl = get_cpu_ptr(&aia_hgei);
+ if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF))
+ csr_clear(CSR_HVIEN, BIT(IRQ_PMU_OVF));
/* Disable per-CPU SGEI interrupt */
csr_clear(CSR_HIE, BIT(IRQ_S_GEXT));
disable_percpu_irq(hgei_parent_irq);
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index 225a435d9c9a..bab2ec34cd87 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -22,22 +22,8 @@ long kvm_arch_dev_ioctl(struct file *filp,
int kvm_arch_hardware_enable(void)
{
- unsigned long hideleg, hedeleg;
-
- hedeleg = 0;
- hedeleg |= (1UL << EXC_INST_MISALIGNED);
- hedeleg |= (1UL << EXC_BREAKPOINT);
- hedeleg |= (1UL << EXC_SYSCALL);
- hedeleg |= (1UL << EXC_INST_PAGE_FAULT);
- hedeleg |= (1UL << EXC_LOAD_PAGE_FAULT);
- hedeleg |= (1UL << EXC_STORE_PAGE_FAULT);
- csr_write(CSR_HEDELEG, hedeleg);
-
- hideleg = 0;
- hideleg |= (1UL << IRQ_VS_SOFT);
- hideleg |= (1UL << IRQ_VS_TIMER);
- hideleg |= (1UL << IRQ_VS_EXT);
- csr_write(CSR_HIDELEG, hideleg);
+ csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT);
+ csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT);
/* VS should access only the time counter directly. Everything else should trap */
csr_write(CSR_HCOUNTEREN, 0x02);
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index a9e2fd7245e1..b63650f9b966 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -550,26 +550,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
return false;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- int ret;
- kvm_pfn_t pfn = pte_pfn(range->arg.pte);
-
- if (!kvm->arch.pgd)
- return false;
-
- WARN_ON(range->end - range->start != 1);
-
- ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT,
- __pfn_to_phys(pfn), PAGE_SIZE, true, true);
- if (ret) {
- kvm_debug("Failed to map G-stage page (error %d)\n", ret);
- return true;
- }
-
- return false;
-}
-
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
pte_t *ptep;
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index b5ca9f2e98ac..17e21df36cc1 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -64,7 +64,9 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
memcpy(csr, reset_csr, sizeof(*csr));
+ spin_lock(&vcpu->arch.reset_cntx_lock);
memcpy(cntx, reset_cntx, sizeof(*cntx));
+ spin_unlock(&vcpu->arch.reset_cntx_lock);
kvm_riscv_vcpu_fp_reset(vcpu);
@@ -102,6 +104,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *cntx;
struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr;
+ spin_lock_init(&vcpu->arch.mp_state_lock);
+
/* Mark this VCPU never ran */
vcpu->arch.ran_atleast_once = false;
vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
@@ -119,12 +123,16 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
spin_lock_init(&vcpu->arch.hfence_lock);
/* Setup reset state of shadow SSTATUS and HSTATUS CSRs */
+ spin_lock_init(&vcpu->arch.reset_cntx_lock);
+
+ spin_lock(&vcpu->arch.reset_cntx_lock);
cntx = &vcpu->arch.guest_reset_context;
cntx->sstatus = SR_SPP | SR_SPIE;
cntx->hstatus = 0;
cntx->hstatus |= HSTATUS_VTW;
cntx->hstatus |= HSTATUS_SPVP;
cntx->hstatus |= HSTATUS_SPV;
+ spin_unlock(&vcpu->arch.reset_cntx_lock);
if (kvm_riscv_vcpu_alloc_vector_context(vcpu, cntx))
return -ENOMEM;
@@ -201,7 +209,7 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) &&
- !vcpu->arch.power_off && !vcpu->arch.pause);
+ !kvm_riscv_vcpu_stopped(vcpu) && !vcpu->arch.pause);
}
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
@@ -365,6 +373,13 @@ void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu)
}
}
+ /* Sync up the HVIP.LCOFIP bit changes (only clear) by the guest */
+ if ((csr->hvip ^ hvip) & (1UL << IRQ_PMU_OVF)) {
+ if (!(hvip & (1UL << IRQ_PMU_OVF)) &&
+ !test_and_set_bit(IRQ_PMU_OVF, v->irqs_pending_mask))
+ clear_bit(IRQ_PMU_OVF, v->irqs_pending);
+ }
+
/* Sync-up AIA high interrupts */
kvm_riscv_vcpu_aia_sync_interrupts(vcpu);
@@ -382,7 +397,8 @@ int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq)
if (irq < IRQ_LOCAL_MAX &&
irq != IRQ_VS_SOFT &&
irq != IRQ_VS_TIMER &&
- irq != IRQ_VS_EXT)
+ irq != IRQ_VS_EXT &&
+ irq != IRQ_PMU_OVF)
return -EINVAL;
set_bit(irq, vcpu->arch.irqs_pending);
@@ -397,14 +413,15 @@ int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq)
int kvm_riscv_vcpu_unset_interrupt(struct kvm_vcpu *vcpu, unsigned int irq)
{
/*
- * We only allow VS-mode software, timer, and external
+ * We only allow VS-mode software, timer, counter overflow and external
* interrupts when irq is one of the local interrupts
* defined by RISC-V privilege specification.
*/
if (irq < IRQ_LOCAL_MAX &&
irq != IRQ_VS_SOFT &&
irq != IRQ_VS_TIMER &&
- irq != IRQ_VS_EXT)
+ irq != IRQ_VS_EXT &&
+ irq != IRQ_PMU_OVF)
return -EINVAL;
clear_bit(irq, vcpu->arch.irqs_pending);
@@ -429,26 +446,42 @@ bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, u64 mask)
return kvm_riscv_vcpu_aia_has_interrupts(vcpu, mask);
}
-void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu)
+void __kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu)
{
- vcpu->arch.power_off = true;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
kvm_make_request(KVM_REQ_SLEEP, vcpu);
kvm_vcpu_kick(vcpu);
}
-void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu)
+void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->arch.mp_state_lock);
+ __kvm_riscv_vcpu_power_off(vcpu);
+ spin_unlock(&vcpu->arch.mp_state_lock);
+}
+
+void __kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu)
{
- vcpu->arch.power_off = false;
+ WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
kvm_vcpu_wake_up(vcpu);
}
+void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->arch.mp_state_lock);
+ __kvm_riscv_vcpu_power_on(vcpu);
+ spin_unlock(&vcpu->arch.mp_state_lock);
+}
+
+bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu)
+{
+ return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
+}
+
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- if (vcpu->arch.power_off)
- mp_state->mp_state = KVM_MP_STATE_STOPPED;
- else
- mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+ *mp_state = READ_ONCE(vcpu->arch.mp_state);
return 0;
}
@@ -458,25 +491,36 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
{
int ret = 0;
+ spin_lock(&vcpu->arch.mp_state_lock);
+
switch (mp_state->mp_state) {
case KVM_MP_STATE_RUNNABLE:
- vcpu->arch.power_off = false;
+ WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
break;
case KVM_MP_STATE_STOPPED:
- kvm_riscv_vcpu_power_off(vcpu);
+ __kvm_riscv_vcpu_power_off(vcpu);
break;
default:
ret = -EINVAL;
}
+ spin_unlock(&vcpu->arch.mp_state_lock);
+
return ret;
}
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
{
- /* TODO; To be implemented later. */
- return -EINVAL;
+ if (dbg->control & KVM_GUESTDBG_ENABLE) {
+ vcpu->guest_debug = dbg->control;
+ vcpu->arch.cfg.hedeleg &= ~BIT(EXC_BREAKPOINT);
+ } else {
+ vcpu->guest_debug = 0;
+ vcpu->arch.cfg.hedeleg |= BIT(EXC_BREAKPOINT);
+ }
+
+ return 0;
}
static void kvm_riscv_vcpu_setup_config(struct kvm_vcpu *vcpu)
@@ -505,6 +549,10 @@ static void kvm_riscv_vcpu_setup_config(struct kvm_vcpu *vcpu)
if (riscv_isa_extension_available(isa, SMSTATEEN))
cfg->hstateen0 |= SMSTATEEN0_SSTATEEN0;
}
+
+ cfg->hedeleg = KVM_HEDELEG_DEFAULT;
+ if (vcpu->guest_debug)
+ cfg->hedeleg &= ~BIT(EXC_BREAKPOINT);
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -519,6 +567,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
csr_write(CSR_VSEPC, csr->vsepc);
csr_write(CSR_VSCAUSE, csr->vscause);
csr_write(CSR_VSTVAL, csr->vstval);
+ csr_write(CSR_HEDELEG, cfg->hedeleg);
csr_write(CSR_HVIP, csr->hvip);
csr_write(CSR_VSATP, csr->vsatp);
csr_write(CSR_HENVCFG, cfg->henvcfg);
@@ -584,11 +633,11 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) {
kvm_vcpu_srcu_read_unlock(vcpu);
rcuwait_wait_event(wait,
- (!vcpu->arch.power_off) && (!vcpu->arch.pause),
+ (!kvm_riscv_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
TASK_INTERRUPTIBLE);
kvm_vcpu_srcu_read_lock(vcpu);
- if (vcpu->arch.power_off || vcpu->arch.pause) {
+ if (kvm_riscv_vcpu_stopped(vcpu) || vcpu->arch.pause) {
/*
* Awaken to handle a signal, request to
* sleep again later.
diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
index 2415722c01b8..5761f95abb60 100644
--- a/arch/riscv/kvm/vcpu_exit.c
+++ b/arch/riscv/kvm/vcpu_exit.c
@@ -204,6 +204,10 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
ret = kvm_riscv_vcpu_sbi_ecall(vcpu, run);
break;
+ case EXC_BREAKPOINT:
+ run->exit_reason = KVM_EXIT_DEBUG;
+ ret = 0;
+ break;
default:
break;
}
diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c
index 994adc26db4b..c676275ea0a0 100644
--- a/arch/riscv/kvm/vcpu_onereg.c
+++ b/arch/riscv/kvm/vcpu_onereg.c
@@ -36,6 +36,7 @@ static const unsigned long kvm_isa_ext_arr[] = {
/* Multi letter extensions (alphabetically sorted) */
KVM_ISA_EXT_ARR(SMSTATEEN),
KVM_ISA_EXT_ARR(SSAIA),
+ KVM_ISA_EXT_ARR(SSCOFPMF),
KVM_ISA_EXT_ARR(SSTC),
KVM_ISA_EXT_ARR(SVINVAL),
KVM_ISA_EXT_ARR(SVNAPOT),
@@ -99,6 +100,9 @@ static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext)
switch (ext) {
case KVM_RISCV_ISA_EXT_H:
return false;
+ case KVM_RISCV_ISA_EXT_SSCOFPMF:
+ /* Sscofpmf depends on interrupt filtering defined in ssaia */
+ return __riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSAIA);
case KVM_RISCV_ISA_EXT_V:
return riscv_v_vstate_ctrl_user_allowed();
default:
@@ -116,6 +120,8 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext)
case KVM_RISCV_ISA_EXT_C:
case KVM_RISCV_ISA_EXT_I:
case KVM_RISCV_ISA_EXT_M:
+ /* There is not architectural config bit to disable sscofpmf completely */
+ case KVM_RISCV_ISA_EXT_SSCOFPMF:
case KVM_RISCV_ISA_EXT_SSTC:
case KVM_RISCV_ISA_EXT_SVINVAL:
case KVM_RISCV_ISA_EXT_SVNAPOT:
diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
index 86391a5061dd..04db1f993c47 100644
--- a/arch/riscv/kvm/vcpu_pmu.c
+++ b/arch/riscv/kvm/vcpu_pmu.c
@@ -14,6 +14,7 @@
#include <asm/csr.h>
#include <asm/kvm_vcpu_sbi.h>
#include <asm/kvm_vcpu_pmu.h>
+#include <asm/sbi.h>
#include <linux/bitops.h>
#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs)
@@ -39,7 +40,7 @@ static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc)
u64 sample_period;
if (!pmc->counter_val)
- sample_period = counter_val_mask + 1;
+ sample_period = counter_val_mask;
else
sample_period = (-pmc->counter_val) & counter_val_mask;
@@ -196,6 +197,36 @@ static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
}
+static int pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
+ unsigned long *out_val)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int fevent_code;
+
+ if (!IS_ENABLED(CONFIG_32BIT)) {
+ pr_warn("%s: should be invoked for only RV32\n", __func__);
+ return -EINVAL;
+ }
+
+ if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) {
+ pr_warn("Invalid counter id [%ld]during read\n", cidx);
+ return -EINVAL;
+ }
+
+ pmc = &kvpmu->pmc[cidx];
+
+ if (pmc->cinfo.type != SBI_PMU_CTR_TYPE_FW)
+ return -EINVAL;
+
+ fevent_code = get_event_code(pmc->event_idx);
+ pmc->counter_val = kvpmu->fw_event[fevent_code].value;
+
+ *out_val = pmc->counter_val >> 32;
+
+ return 0;
+}
+
static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
unsigned long *out_val)
{
@@ -204,6 +235,11 @@ static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
u64 enabled, running;
int fevent_code;
+ if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) {
+ pr_warn("Invalid counter id [%ld] during read\n", cidx);
+ return -EINVAL;
+ }
+
pmc = &kvpmu->pmc[cidx];
if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
@@ -229,8 +265,50 @@ static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ct
return 0;
}
-static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr,
- unsigned long flags, unsigned long eidx, unsigned long evtdata)
+static void kvm_riscv_pmu_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct kvm_vcpu *vcpu = pmc->vcpu;
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct riscv_pmu *rpmu = to_riscv_pmu(perf_event->pmu);
+ u64 period;
+
+ /*
+ * Stop the event counting by directly accessing the perf_event.
+ * Otherwise, this needs to deferred via a workqueue.
+ * That will introduce skew in the counter value because the actual
+ * physical counter would start after returning from this function.
+ * It will be stopped again once the workqueue is scheduled
+ */
+ rpmu->pmu.stop(perf_event, PERF_EF_UPDATE);
+
+ /*
+ * The hw counter would start automatically when this function returns.
+ * Thus, the host may continue to interrupt and inject it to the guest
+ * even without the guest configuring the next event. Depending on the hardware
+ * the host may have some sluggishness only if privilege mode filtering is not
+ * available. In an ideal world, where qemu is not the only capable hardware,
+ * this can be removed.
+ * FYI: ARM64 does this way while x86 doesn't do anything as such.
+ * TODO: Should we keep it for RISC-V ?
+ */
+ period = -(local64_read(&perf_event->count));
+
+ local64_set(&perf_event->hw.period_left, 0);
+ perf_event->attr.sample_period = period;
+ perf_event->hw.sample_period = period;
+
+ set_bit(pmc->idx, kvpmu->pmc_overflown);
+ kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_PMU_OVF);
+
+ rpmu->pmu.start(perf_event, PERF_EF_RELOAD);
+}
+
+static long kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr,
+ unsigned long flags, unsigned long eidx,
+ unsigned long evtdata)
{
struct perf_event *event;
@@ -247,7 +325,7 @@ static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr
*/
attr->sample_period = kvm_pmu_get_sample_period(pmc);
- event = perf_event_create_kernel_counter(attr, -1, current, NULL, pmc);
+ event = perf_event_create_kernel_counter(attr, -1, current, kvm_riscv_pmu_overflow, pmc);
if (IS_ERR(event)) {
pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event));
return PTR_ERR(event);
@@ -310,6 +388,80 @@ int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
return ret;
}
+static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data);
+
+ if (kvpmu->sdata) {
+ if (kvpmu->snapshot_addr != INVALID_GPA) {
+ memset(kvpmu->sdata, 0, snapshot_area_size);
+ kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr,
+ kvpmu->sdata, snapshot_area_size);
+ } else {
+ pr_warn("snapshot address invalid\n");
+ }
+ kfree(kvpmu->sdata);
+ kvpmu->sdata = NULL;
+ }
+ kvpmu->snapshot_addr = INVALID_GPA;
+}
+
+int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long saddr_low,
+ unsigned long saddr_high, unsigned long flags,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data);
+ int sbiret = 0;
+ gpa_t saddr;
+ unsigned long hva;
+ bool writable;
+
+ if (!kvpmu || flags) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ if (saddr_low == SBI_SHMEM_DISABLE && saddr_high == SBI_SHMEM_DISABLE) {
+ kvm_pmu_clear_snapshot_area(vcpu);
+ return 0;
+ }
+
+ saddr = saddr_low;
+
+ if (saddr_high != 0) {
+ if (IS_ENABLED(CONFIG_32BIT))
+ saddr |= ((gpa_t)saddr_high << 32);
+ else
+ sbiret = SBI_ERR_INVALID_ADDRESS;
+ goto out;
+ }
+
+ hva = kvm_vcpu_gfn_to_hva_prot(vcpu, saddr >> PAGE_SHIFT, &writable);
+ if (kvm_is_error_hva(hva) || !writable) {
+ sbiret = SBI_ERR_INVALID_ADDRESS;
+ goto out;
+ }
+
+ kvpmu->sdata = kzalloc(snapshot_area_size, GFP_ATOMIC);
+ if (!kvpmu->sdata)
+ return -ENOMEM;
+
+ if (kvm_vcpu_write_guest(vcpu, saddr, kvpmu->sdata, snapshot_area_size)) {
+ kfree(kvpmu->sdata);
+ sbiret = SBI_ERR_FAILURE;
+ goto out;
+ }
+
+ kvpmu->snapshot_addr = saddr;
+
+out:
+ retdata->err_val = sbiret;
+
+ return 0;
+}
+
int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu,
struct kvm_vcpu_sbi_return *retdata)
{
@@ -343,20 +495,40 @@ int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
int i, pmc_index, sbiret = 0;
struct kvm_pmc *pmc;
int fevent_code;
+ bool snap_flag_set = flags & SBI_PMU_START_FLAG_INIT_SNAPSHOT;
if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
sbiret = SBI_ERR_INVALID_PARAM;
goto out;
}
+ if (snap_flag_set) {
+ if (kvpmu->snapshot_addr == INVALID_GPA) {
+ sbiret = SBI_ERR_NO_SHMEM;
+ goto out;
+ }
+ if (kvm_vcpu_read_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata,
+ sizeof(struct riscv_pmu_snapshot_data))) {
+ pr_warn("Unable to read snapshot shared memory while starting counters\n");
+ sbiret = SBI_ERR_FAILURE;
+ goto out;
+ }
+ }
/* Start the counters that have been configured and requested by the guest */
for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
pmc_index = i + ctr_base;
if (!test_bit(pmc_index, kvpmu->pmc_in_use))
continue;
+ /* The guest started the counter again. Reset the overflow status */
+ clear_bit(pmc_index, kvpmu->pmc_overflown);
pmc = &kvpmu->pmc[pmc_index];
- if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE)
+ if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) {
pmc->counter_val = ival;
+ } else if (snap_flag_set) {
+ /* The counter index in the snapshot are relative to the counter base */
+ pmc->counter_val = kvpmu->sdata->ctr_values[i];
+ }
+
if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
fevent_code = get_event_code(pmc->event_idx);
if (fevent_code >= SBI_PMU_FW_MAX) {
@@ -400,12 +572,19 @@ int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
u64 enabled, running;
struct kvm_pmc *pmc;
int fevent_code;
+ bool snap_flag_set = flags & SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
+ bool shmem_needs_update = false;
if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
sbiret = SBI_ERR_INVALID_PARAM;
goto out;
}
+ if (snap_flag_set && kvpmu->snapshot_addr == INVALID_GPA) {
+ sbiret = SBI_ERR_NO_SHMEM;
+ goto out;
+ }
+
/* Stop the counters that have been configured and requested by the guest */
for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
pmc_index = i + ctr_base;
@@ -432,21 +611,49 @@ int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
sbiret = SBI_ERR_ALREADY_STOPPED;
}
- if (flags & SBI_PMU_STOP_FLAG_RESET) {
- /* Relase the counter if this is a reset request */
- pmc->counter_val += perf_event_read_value(pmc->perf_event,
- &enabled, &running);
+ if (flags & SBI_PMU_STOP_FLAG_RESET)
+ /* Release the counter if this is a reset request */
kvm_pmu_release_perf_event(pmc);
- }
} else {
sbiret = SBI_ERR_INVALID_PARAM;
}
+
+ if (snap_flag_set && !sbiret) {
+ if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW)
+ pmc->counter_val = kvpmu->fw_event[fevent_code].value;
+ else if (pmc->perf_event)
+ pmc->counter_val += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ /*
+ * The counter and overflow indicies in the snapshot region are w.r.to
+ * cbase. Modify the set bit in the counter mask instead of the pmc_index
+ * which indicates the absolute counter index.
+ */
+ if (test_bit(pmc_index, kvpmu->pmc_overflown))
+ kvpmu->sdata->ctr_overflow_mask |= BIT(i);
+ kvpmu->sdata->ctr_values[i] = pmc->counter_val;
+ shmem_needs_update = true;
+ }
+
if (flags & SBI_PMU_STOP_FLAG_RESET) {
pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
clear_bit(pmc_index, kvpmu->pmc_in_use);
+ clear_bit(pmc_index, kvpmu->pmc_overflown);
+ if (snap_flag_set) {
+ /*
+ * Only clear the given counter as the caller is responsible to
+ * validate both the overflow mask and configured counters.
+ */
+ kvpmu->sdata->ctr_overflow_mask &= ~BIT(i);
+ shmem_needs_update = true;
+ }
}
}
+ if (shmem_needs_update)
+ kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata,
+ sizeof(struct riscv_pmu_snapshot_data));
+
out:
retdata->err_val = sbiret;
@@ -458,7 +665,8 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba
unsigned long eidx, u64 evtdata,
struct kvm_vcpu_sbi_return *retdata)
{
- int ctr_idx, ret, sbiret = 0;
+ int ctr_idx, sbiret = 0;
+ long ret;
bool is_fevent;
unsigned long event_code;
u32 etype = kvm_pmu_get_perf_event_type(eidx);
@@ -517,8 +725,10 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba
kvpmu->fw_event[event_code].started = true;
} else {
ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata);
- if (ret)
- return ret;
+ if (ret) {
+ sbiret = SBI_ERR_NOT_SUPPORTED;
+ goto out;
+ }
}
set_bit(ctr_idx, kvpmu->pmc_in_use);
@@ -530,7 +740,19 @@ out:
return 0;
}
-int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
+int kvm_riscv_vcpu_pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ret;
+
+ ret = pmu_fw_ctr_read_hi(vcpu, cidx, &retdata->out_val);
+ if (ret == -EINVAL)
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_fw_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
struct kvm_vcpu_sbi_return *retdata)
{
int ret;
@@ -566,6 +788,7 @@ void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
kvpmu->num_hw_ctrs = num_hw_ctrs + 1;
kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX;
memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
+ kvpmu->snapshot_addr = INVALID_GPA;
if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) {
pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA");
@@ -585,6 +808,7 @@ void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
pmc = &kvpmu->pmc[i];
pmc->idx = i;
pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
+ pmc->vcpu = vcpu;
if (i < kvpmu->num_hw_ctrs) {
pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW;
if (i < 3)
@@ -601,7 +825,7 @@ void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
pmc->cinfo.csr = CSR_CYCLE + i;
} else {
pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
- pmc->cinfo.width = BITS_PER_LONG - 1;
+ pmc->cinfo.width = 63;
}
}
@@ -617,14 +841,16 @@ void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
if (!kvpmu)
return;
- for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_MAX_COUNTERS) {
+ for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS) {
pmc = &kvpmu->pmc[i];
pmc->counter_val = 0;
kvm_pmu_release_perf_event(pmc);
pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
}
- bitmap_zero(kvpmu->pmc_in_use, RISCV_MAX_COUNTERS);
+ bitmap_zero(kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS);
+ bitmap_zero(kvpmu->pmc_overflown, RISCV_KVM_MAX_COUNTERS);
memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
+ kvm_pmu_clear_snapshot_area(vcpu);
}
void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index 72a2ffb8dcd1..62f409d4176e 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -138,8 +138,11 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
unsigned long i;
struct kvm_vcpu *tmp;
- kvm_for_each_vcpu(i, tmp, vcpu->kvm)
- tmp->arch.power_off = true;
+ kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
+ spin_lock(&vcpu->arch.mp_state_lock);
+ WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
+ spin_unlock(&vcpu->arch.mp_state_lock);
+ }
kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
memset(&run->system_event, 0, sizeof(run->system_event));
diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c
index 7dca0e9381d9..dce667f4b6ab 100644
--- a/arch/riscv/kvm/vcpu_sbi_hsm.c
+++ b/arch/riscv/kvm/vcpu_sbi_hsm.c
@@ -18,13 +18,20 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
struct kvm_vcpu *target_vcpu;
unsigned long target_vcpuid = cp->a0;
+ int ret = 0;
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid);
if (!target_vcpu)
return SBI_ERR_INVALID_PARAM;
- if (!target_vcpu->arch.power_off)
- return SBI_ERR_ALREADY_AVAILABLE;
+ spin_lock(&target_vcpu->arch.mp_state_lock);
+
+ if (!kvm_riscv_vcpu_stopped(target_vcpu)) {
+ ret = SBI_ERR_ALREADY_AVAILABLE;
+ goto out;
+ }
+
+ spin_lock(&target_vcpu->arch.reset_cntx_lock);
reset_cntx = &target_vcpu->arch.guest_reset_context;
/* start address */
reset_cntx->sepc = cp->a1;
@@ -32,21 +39,35 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu)
reset_cntx->a0 = target_vcpuid;
/* private data passed from kernel */
reset_cntx->a1 = cp->a2;
+ spin_unlock(&target_vcpu->arch.reset_cntx_lock);
+
kvm_make_request(KVM_REQ_VCPU_RESET, target_vcpu);
- kvm_riscv_vcpu_power_on(target_vcpu);
+ __kvm_riscv_vcpu_power_on(target_vcpu);
- return 0;
+out:
+ spin_unlock(&target_vcpu->arch.mp_state_lock);
+
+ return ret;
}
static int kvm_sbi_hsm_vcpu_stop(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.power_off)
- return SBI_ERR_FAILURE;
+ int ret = 0;
- kvm_riscv_vcpu_power_off(vcpu);
+ spin_lock(&vcpu->arch.mp_state_lock);
- return 0;
+ if (kvm_riscv_vcpu_stopped(vcpu)) {
+ ret = SBI_ERR_FAILURE;
+ goto out;
+ }
+
+ __kvm_riscv_vcpu_power_off(vcpu);
+
+out:
+ spin_unlock(&vcpu->arch.mp_state_lock);
+
+ return ret;
}
static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu)
@@ -58,7 +79,7 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu)
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid);
if (!target_vcpu)
return SBI_ERR_INVALID_PARAM;
- if (!target_vcpu->arch.power_off)
+ if (!kvm_riscv_vcpu_stopped(target_vcpu))
return SBI_HSM_STATE_STARTED;
else if (vcpu->stat.generic.blocking)
return SBI_HSM_STATE_SUSPENDED;
@@ -71,14 +92,11 @@ static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
{
int ret = 0;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
- struct kvm *kvm = vcpu->kvm;
unsigned long funcid = cp->a6;
switch (funcid) {
case SBI_EXT_HSM_HART_START:
- mutex_lock(&kvm->lock);
ret = kvm_sbi_hsm_vcpu_start(vcpu);
- mutex_unlock(&kvm->lock);
break;
case SBI_EXT_HSM_HART_STOP:
ret = kvm_sbi_hsm_vcpu_stop(vcpu);
diff --git a/arch/riscv/kvm/vcpu_sbi_pmu.c b/arch/riscv/kvm/vcpu_sbi_pmu.c
index 7eca72df2cbd..e4be34e03e83 100644
--- a/arch/riscv/kvm/vcpu_sbi_pmu.c
+++ b/arch/riscv/kvm/vcpu_sbi_pmu.c
@@ -42,9 +42,9 @@ static int kvm_sbi_ext_pmu_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
#endif
/*
* This can fail if perf core framework fails to create an event.
- * Forward the error to userspace because it's an error which
- * happened within the host kernel. The other option would be
- * to convert to an SBI error and forward to the guest.
+ * No need to forward the error to userspace and exit the guest.
+ * The operation can continue without profiling. Forward the
+ * appropriate SBI error to the guest.
*/
ret = kvm_riscv_vcpu_pmu_ctr_cfg_match(vcpu, cp->a0, cp->a1,
cp->a2, cp->a3, temp, retdata);
@@ -62,7 +62,16 @@ static int kvm_sbi_ext_pmu_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
ret = kvm_riscv_vcpu_pmu_ctr_stop(vcpu, cp->a0, cp->a1, cp->a2, retdata);
break;
case SBI_EXT_PMU_COUNTER_FW_READ:
- ret = kvm_riscv_vcpu_pmu_ctr_read(vcpu, cp->a0, retdata);
+ ret = kvm_riscv_vcpu_pmu_fw_ctr_read(vcpu, cp->a0, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_FW_READ_HI:
+ if (IS_ENABLED(CONFIG_32BIT))
+ ret = kvm_riscv_vcpu_pmu_fw_ctr_read_hi(vcpu, cp->a0, retdata);
+ else
+ retdata->out_val = 0;
+ break;
+ case SBI_EXT_PMU_SNAPSHOT_SET_SHMEM:
+ ret = kvm_riscv_vcpu_pmu_snapshot_set_shmem(vcpu, cp->a0, cp->a1, cp->a2, retdata);
break;
default:
retdata->err_val = SBI_ERR_NOT_SUPPORTED;
diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c
index d8cf9ca28c61..5f35427114c1 100644
--- a/arch/riscv/kvm/vcpu_sbi_sta.c
+++ b/arch/riscv/kvm/vcpu_sbi_sta.c
@@ -93,8 +93,8 @@ static int kvm_sbi_sta_steal_time_set_shmem(struct kvm_vcpu *vcpu)
if (flags != 0)
return SBI_ERR_INVALID_PARAM;
- if (shmem_phys_lo == SBI_STA_SHMEM_DISABLE &&
- shmem_phys_hi == SBI_STA_SHMEM_DISABLE) {
+ if (shmem_phys_lo == SBI_SHMEM_DISABLE &&
+ shmem_phys_hi == SBI_SHMEM_DISABLE) {
vcpu->arch.sta.shmem = INVALID_GPA;
return 0;
}
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index ce58bc48e5b8..7396b8654f45 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -186,6 +186,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_READONLY_MEM:
case KVM_CAP_MP_STATE:
case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_SET_GUEST_DEBUG:
r = 1;
break;
case KVM_CAP_NR_VCPUS:
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index a2be3aefff9f..f86ad3335529 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -143,6 +143,9 @@ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfe
extern u64 xstate_get_guest_group_perm(void);
+extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+
+
/* KVM specific functions */
extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 110d7f29ca9a..5187fcf4b610 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -121,6 +121,7 @@ KVM_X86_OP(enter_smm)
KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
#endif
+KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6efd1497b026..69862e8ecab3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -254,28 +254,31 @@ enum x86_intercept_stage;
KVM_GUESTDBG_INJECT_DB | \
KVM_GUESTDBG_BLOCKIRQ)
+#define PFERR_PRESENT_MASK BIT(0)
+#define PFERR_WRITE_MASK BIT(1)
+#define PFERR_USER_MASK BIT(2)
+#define PFERR_RSVD_MASK BIT(3)
+#define PFERR_FETCH_MASK BIT(4)
+#define PFERR_PK_MASK BIT(5)
+#define PFERR_SGX_MASK BIT(15)
+#define PFERR_GUEST_RMP_MASK BIT_ULL(31)
+#define PFERR_GUEST_FINAL_MASK BIT_ULL(32)
+#define PFERR_GUEST_PAGE_MASK BIT_ULL(33)
+#define PFERR_GUEST_ENC_MASK BIT_ULL(34)
+#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35)
+#define PFERR_GUEST_VMPL_MASK BIT_ULL(36)
-#define PFERR_PRESENT_BIT 0
-#define PFERR_WRITE_BIT 1
-#define PFERR_USER_BIT 2
-#define PFERR_RSVD_BIT 3
-#define PFERR_FETCH_BIT 4
-#define PFERR_PK_BIT 5
-#define PFERR_SGX_BIT 15
-#define PFERR_GUEST_FINAL_BIT 32
-#define PFERR_GUEST_PAGE_BIT 33
-#define PFERR_IMPLICIT_ACCESS_BIT 48
-
-#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
-#define PFERR_USER_MASK BIT(PFERR_USER_BIT)
-#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT)
-#define PFERR_PK_MASK BIT(PFERR_PK_BIT)
-#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT)
-#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
-#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
-#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
+/*
+ * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks
+ * when emulating instructions that triggers implicit access.
+ */
+#define PFERR_IMPLICIT_ACCESS BIT_ULL(48)
+/*
+ * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred
+ * when the guest was accessing private memory.
+ */
+#define PFERR_PRIVATE_ACCESS BIT_ULL(49)
+#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
PFERR_WRITE_MASK | \
@@ -1280,12 +1283,14 @@ enum kvm_apicv_inhibit {
};
struct kvm_arch {
- unsigned long vm_type;
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
u8 mmu_valid_gen;
+ u8 vm_type;
+ bool has_private_mem;
+ bool has_protected_state;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
@@ -1312,6 +1317,8 @@ struct kvm_arch {
*/
spinlock_t mmu_unsync_pages_lock;
+ u64 shadow_mmio_value;
+
struct iommu_domain *iommu_domain;
bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
@@ -1779,6 +1786,7 @@ struct kvm_x86_ops {
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
#endif
+ int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
@@ -1844,6 +1852,7 @@ struct kvm_arch_async_pf {
gfn_t gfn;
unsigned long cr3;
bool direct_map;
+ u64 error_code;
};
extern u32 __read_mostly kvm_nr_uret_msrs;
@@ -2140,6 +2149,10 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
}
+unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
@@ -2153,8 +2166,9 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
+
#ifdef CONFIG_KVM_PRIVATE_MEM
-#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
+#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#else
#define kvm_arch_has_private_mem(kvm) false
#endif
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index b463fcbd4b90..5a8246dd532f 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -54,8 +54,10 @@
(((unsigned long)fn) << 32))
/* AP Reset Hold */
-#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
-#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0)
/* GHCB GPA Register */
#define GHCB_MSR_REG_GPA_REQ 0x012
@@ -99,6 +101,8 @@ enum psc_op {
/* GHCB Hypervisor Feature Request/Response */
#define GHCB_MSR_HV_FT_REQ 0x080
#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_POS 12
+#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0)
#define GHCB_MSR_HV_FT_RESP_VAL(v) \
/* GHCBData[63:12] */ \
(((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 4dba17363008..d77a31039f24 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -71,6 +71,7 @@
#define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING)
#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES)
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
@@ -226,6 +227,8 @@ enum vmcs_field {
VMREAD_BITMAP_HIGH = 0x00002027,
VMWRITE_BITMAP = 0x00002028,
VMWRITE_BITMAP_HIGH = 0x00002029,
+ VE_INFORMATION_ADDRESS = 0x0000202A,
+ VE_INFORMATION_ADDRESS_HIGH = 0x0000202B,
XSS_EXIT_BITMAP = 0x0000202C,
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
ENCLS_EXITING_BITMAP = 0x0000202E,
@@ -514,6 +517,7 @@ enum vmcs_field {
#define VMX_EPT_IPAT_BIT (1ull << 6)
#define VMX_EPT_ACCESS_BIT (1ull << 8)
#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63)
#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
VMX_EPT_WRITABLE_MASK | \
VMX_EPT_EXECUTABLE_MASK)
@@ -630,4 +634,13 @@ enum vmx_l1d_flush_state {
extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+struct vmx_ve_information {
+ u32 exit_reason;
+ u32 delivery;
+ u64 exit_qualification;
+ u64 guest_linear_address;
+ u64 guest_physical_address;
+ u16 eptp_index;
+};
+
#endif
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index ef11aa4cab42..9fae1b73b529 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -457,8 +457,13 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
-/* attributes for system fd (group 0) */
-#define KVM_X86_XCOMP_GUEST_SUPP 0
+/* vendor-independent attributes for system fd (group 0) */
+#define KVM_X86_GRP_SYSTEM 0
+# define KVM_X86_XCOMP_GUEST_SUPP 0
+
+/* vendor-specific groups and attributes for system fd */
+#define KVM_X86_GRP_SEV 1
+# define KVM_X86_SEV_VMSA_FEATURES 0
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
@@ -689,6 +694,9 @@ enum sev_cmd_id {
/* Guest Migration Extension */
KVM_SEV_SEND_CANCEL,
+ /* Second time is the charm; improved versions of the above ioctls. */
+ KVM_SEV_INIT2,
+
KVM_SEV_NR_MAX,
};
@@ -700,6 +708,14 @@ struct kvm_sev_cmd {
__u32 sev_fd;
};
+struct kvm_sev_init {
+ __u64 vmsa_features;
+ __u32 flags;
+ __u16 ghcb_version;
+ __u16 pad1;
+ __u32 pad2[8];
+};
+
struct kvm_sev_launch_start {
__u32 handle;
__u32 policy;
@@ -856,5 +872,7 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_DEFAULT_VM 0
#define KVM_X86_SW_PROTECTED_VM 1
+#define KVM_X86_SEV_VM 2
+#define KVM_X86_SEV_ES_VM 3
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 33a214b1a4ce..6d32e415b01e 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -991,6 +991,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return __raw_xsave_addr(xsave, xfeature_nr);
}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
#ifdef CONFIG_ARCH_HAS_PKEYS
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 19ca623ffa2a..05df04f39628 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -54,8 +54,6 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);
-extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
-
static inline u64 xfeatures_mask_supervisor(void)
{
return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 0ebdd088f28b..d64fb2b3eb69 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -95,6 +95,19 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config KVM_INTEL_PROVE_VE
+ bool "Check that guests do not receive #VE exceptions"
+ default KVM_PROVE_MMU || DEBUG_KERNEL
+ depends on KVM_INTEL
+ help
+
+ Checks that KVM's page table management code will not incorrectly
+ let guests receive a virtualization exception. Virtualization
+ exceptions will be trapped by the hypervisor rather than injected
+ in the guest.
+
+ If unsure, say N.
+
config X86_SGX_KVM
bool "Software Guard eXtensions (SGX) Virtualization"
depends on X86_SGX && KVM_INTEL
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index addc44fc7187..5494669a055a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -16,14 +16,15 @@ kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
- vmx/nested.o vmx/posted_intr.o
+ vmx/nested.o vmx/posted_intr.o vmx/main.o
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o
-kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
- svm/sev.o
-kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o
+
+kvm-amd-$(CONFIG_KVM_AMD_SEV) += svm/sev.o
+kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
ifdef CONFIG_HYPERV
kvm-y += kvm_onhyperv.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 77352a4abd87..1851b3870a9c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -772,7 +772,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
- 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
+ 0 /* SME */ | 0 /* SEV */ | 0 /* VM_PAGE_FLUSH */ | 0 /* SEV_ES */ |
F(SME_COHERENT));
kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 60f21bb4c27b..2343c9f00e31 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
*/
u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
- int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
+ int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
u32 errcode = PFERR_PRESENT_MASK;
bool fault;
@@ -234,8 +234,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
- offset = (pfec & ~1) +
- ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+ offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
pkru_bits &= mmu->pkru_mask >> offset;
errcode |= -pkru_bits & PFERR_PK_MASK;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index db007a4dffa2..c39674d19260 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -432,8 +432,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte (mm/gup.c).
*
- * An spte tlb flush may be pending, because kvm_set_pte_rmap
- * coalesces them and we are running out of the MMU lock. Therefore
+ * An spte tlb flush may be pending, because they are coalesced and
+ * we are running out of the MMU lock. Therefore
* we need to protect against in-progress updates of the spte.
*
* Reading the spte while an update is in progress may get the old value
@@ -567,9 +567,9 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
- old_spte = __update_clear_spte_slow(sptep, 0ull);
+ old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
if (!is_shadow_present_pte(old_spte))
return old_spte;
@@ -603,7 +603,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
*/
static void mmu_spte_clear_no_track(u64 *sptep)
{
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
}
static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -1448,49 +1448,11 @@ static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
}
static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level)
{
return __kvm_zap_rmap(kvm, rmap_head, slot);
}
-static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t pte)
-{
- u64 *sptep;
- struct rmap_iterator iter;
- bool need_flush = false;
- u64 new_spte;
- kvm_pfn_t new_pfn;
-
- WARN_ON_ONCE(pte_huge(pte));
- new_pfn = pte_pfn(pte);
-
-restart:
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- need_flush = true;
-
- if (pte_write(pte)) {
- kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
- goto restart;
- } else {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- *sptep, new_pfn);
-
- mmu_spte_clear_track_bits(kvm, sptep);
- mmu_spte_set(sptep, new_spte);
- }
- }
-
- if (need_flush && kvm_available_flush_remote_tlbs_range()) {
- kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
- return false;
- }
-
- return need_flush;
-}
-
struct slot_rmap_walk_iterator {
/* input fields. */
const struct kvm_memory_slot *slot;
@@ -1562,7 +1524,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t pte);
+ int level);
static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range,
@@ -1574,7 +1536,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
range->start, range->end - 1, &iterator)
ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- iterator.level, range->arg.pte);
+ iterator.level);
return ret;
}
@@ -1596,22 +1558,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
return flush;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- bool flush = false;
-
- if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
-
- if (tdp_mmu_enabled)
- flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
-
- return flush;
-}
-
static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1624,8 +1572,7 @@ static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
}
static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t unused)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1950,7 +1897,8 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- if (!sp->spt[i])
+ /* sp->spt[i] has initial value of shadow page table allocation */
+ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
return 0;
return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
@@ -2514,7 +2462,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
- } else if (is_mmio_spte(pte)) {
+ } else if (is_mmio_spte(kvm, pte)) {
mmu_spte_clear_no_track(spte);
}
return 0;
@@ -3314,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
{
gva_t gva = fault->is_tdp ? 0 : fault->addr;
+ if (fault->is_private) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
access & shadow_mmio_access_mask);
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
+ fault->hva = KVM_HVA_ERR_BAD;
+
/*
* If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an
@@ -4196,7 +4154,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
if (WARN_ON_ONCE(reserved))
return -EINVAL;
- if (is_mmio_spte(spte)) {
+ if (is_mmio_spte(vcpu->kvm, spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned int access = get_mmio_spte_access(spte);
@@ -4259,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
}
-static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- gfn_t gfn)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
struct kvm_arch_async_pf arch;
arch.token = alloc_apf_token(vcpu);
- arch.gfn = gfn;
+ arch.gfn = fault->gfn;
+ arch.error_code = fault->error_code;
arch.direct_map = vcpu->arch.mmu->root_role.direct;
arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
- return kvm_setup_async_pf(vcpu, cr2_or_gpa,
- kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, fault->addr,
+ kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
}
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
+ if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
+ return;
+
if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
work->wakeup_all)
return;
@@ -4289,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
}
static inline u8 kvm_max_level_for_order(int order)
@@ -4309,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order)
return PG_LEVEL_4K;
}
-static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
-{
- kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
- PAGE_SIZE, fault->write, fault->exec,
- fault->is_private);
-}
-
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
@@ -4343,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_memory_slot *slot = fault->slot;
bool async;
- /*
- * Retry the page fault if the gfn hit a memslot that is being deleted
- * or moved. This ensures any existing SPTEs for the old memslot will
- * be zapped before KVM inserts a new MMIO SPTE for the gfn.
- */
- if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
- return RET_PF_RETRY;
-
- if (!kvm_is_visible_memslot(slot)) {
- /* Don't expose private memslots to L2. */
- if (is_guest_mode(vcpu)) {
- fault->slot = NULL;
- fault->pfn = KVM_PFN_NOSLOT;
- fault->map_writable = false;
- return RET_PF_CONTINUE;
- }
- /*
- * If the APIC access page exists but is disabled, go directly
- * to emulation without caching the MMIO access or creating a
- * MMIO SPTE. That way the cache doesn't need to be purged
- * when the AVIC is re-enabled.
- */
- if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
- !kvm_apicv_activated(vcpu->kvm))
- return RET_PF_EMULATE;
- }
-
- if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
- kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
- return -EFAULT;
- }
-
if (fault->is_private)
return kvm_faultin_pfn_private(vcpu, fault);
async = false;
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
- fault->write, &fault->map_writable,
- &fault->hva);
+ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
+ &async, fault->write,
+ &fault->map_writable, &fault->hva);
if (!async)
return RET_PF_CONTINUE; /* *pfn has correct page already */
@@ -4394,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return RET_PF_RETRY;
- } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
+ } else if (kvm_arch_setup_async_pf(vcpu, fault)) {
return RET_PF_RETRY;
}
}
@@ -4404,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* to wait for IO. Note, gup always bails if it is unable to quickly
* get a page and a fatal signal, i.e. SIGKILL, is pending.
*/
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
- fault->write, &fault->map_writable,
- &fault->hva);
+ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
+ NULL, fault->write,
+ &fault->map_writable, &fault->hva);
return RET_PF_CONTINUE;
}
static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
unsigned int access)
{
+ struct kvm_memory_slot *slot = fault->slot;
int ret;
+ /*
+ * Note that the mmu_invalidate_seq also serves to detect a concurrent
+ * change in attributes. is_page_fault_stale() will detect an
+ * invalidation relate to fault->fn and resume the guest without
+ * installing a mapping in the page tables.
+ */
+ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ /*
+ * Now that we have a snapshot of mmu_invalidate_seq we can check for a
+ * private vs. shared mismatch.
+ */
+ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (unlikely(!slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot->flags & KVM_MEMSLOT_INVALID)
+ return RET_PF_RETRY;
+
+ if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
+ /*
+ * Don't map L1's APIC access page into L2, KVM doesn't support
+ * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
+ * i.e. the access needs to be emulated. Emulating access to
+ * L1's APIC is also correct if L1 is accelerating L2's own
+ * virtual APIC, but for some reason L1 also maps _L1's_ APIC
+ * into L2. Note, vcpu_is_mmio_gpa() always treats access to
+ * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM
+ * uses different roots for L1 vs. L2, i.e. there is no danger
+ * of breaking APICv/AVIC for L1.
+ */
+ if (is_guest_mode(vcpu))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (!kvm_apicv_activated(vcpu->kvm))
+ return RET_PF_EMULATE;
+ }
+
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
@@ -4439,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
* to detect retry guarantees the worst case latency for the vCPU.
*/
- if (fault->slot &&
- mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
ret = __kvm_faultin_pfn(vcpu, fault);
@@ -4450,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (unlikely(is_error_pfn(fault->pfn)))
return kvm_handle_error_pfn(vcpu, fault);
- if (unlikely(!fault->slot))
+ if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
return kvm_handle_noslot_fault(vcpu, fault, access);
/*
@@ -4561,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
if (WARN_ON_ONCE(fault_address >> 32))
return -EFAULT;
#endif
+ /*
+ * Legacy #PF exception only have a 32-bit error code. Simply drop the
+ * upper bits as KVM doesn't use them for #PF (because they are never
+ * set), and to ensure there are no collisions with KVM-defined bits.
+ */
+ if (WARN_ON_ONCE(error_code >> 32))
+ error_code = lower_32_bits(error_code);
+
+ /* Ensure the above sanity check also covers KVM-defined flags. */
+ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
vcpu->arch.l1tf_flush_l1d = true;
if (!flags) {
@@ -4812,7 +4797,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
{
- if (unlikely(is_mmio_spte(*sptep))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
mmu_spte_clear_no_track(sptep);
return true;
@@ -5846,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->root_role.direct;
- /*
- * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
- * checks when emulating instructions that triggers implicit access.
- * WARN if hardware generates a fault with an error code that collides
- * with the KVM-defined value. Clear the flag and continue on, i.e.
- * don't terminate the VM, as KVM can't possibly be relying on a flag
- * that KVM doesn't know about.
- */
- if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
- error_code &= ~PFERR_IMPLICIT_ACCESS;
-
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
+ /*
+ * Except for reserved faults (emulated MMIO is shared-only), set the
+ * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
+ * current attributes, which are the source of truth for such VMs. Note,
+ * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
+ * currently supported nested virtualization (among many other things)
+ * for software-protected VMs.
+ */
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
+ !(error_code & PFERR_RSVD_MASK) &&
+ vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
+ return -EFAULT;
+
r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
if (r == RET_PF_EMULATE)
goto emulate;
}
if (r == RET_PF_INVALID) {
- r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- lower_32_bits(error_code), false,
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
&emulation_type);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
@@ -6173,7 +6163,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
- vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+ vcpu->arch.mmu_shadow_page_cache.init_value =
+ SHADOW_NONPRESENT_VALUE;
+ if (!vcpu->arch.mmu_shadow_page_cache.init_value)
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -6316,6 +6309,7 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
void kvm_mmu_init_vm(struct kvm *kvm)
{
+ kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 5390a591a571..ce2fcd19ba6b 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
struct kvm_page_fault {
/* arguments to kvm_mmu_do_page_fault. */
const gpa_t addr;
- const u32 error_code;
+ const u64 error_code;
const bool prefetch;
/* Derived from error_code. */
@@ -279,8 +279,16 @@ enum {
RET_PF_SPURIOUS,
};
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefetch, int *emulation_type)
+ u64 err, bool prefetch, int *emulation_type)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@@ -298,7 +306,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
- .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
+ .is_private = err & PFERR_PRIVATE_ACCESS,
+
+ .pfn = KVM_PFN_ERR_FAULT,
+ .hva = KVM_HVA_ERR_BAD,
};
int r;
@@ -320,6 +331,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+ /*
+ * Not sure what's happening, but punt to userspace and hope that
+ * they can fix it by changing memory to shared, or they can
+ * provide a better error.
+ */
+ if (r == RET_PF_EMULATE && fault.is_private) {
+ pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
+ kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+ return -EFAULT;
+ }
+
if (fault.write_fault_to_shadow_pgtable && emulation_type)
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..195d98bc8de8 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -260,7 +260,7 @@ TRACE_EVENT(
TP_STRUCT__entry(
__field(int, vcpu_id)
__field(gpa_t, cr2_or_gpa)
- __field(u32, error_code)
+ __field(u64, error_code)
__field(u64 *, sptep)
__field(u64, old_spte)
__field(u64, new_spte)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 4d4e98fe4f35..9aac3aa93d88 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -911,7 +911,7 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gpa_t pte_gpa;
gfn_t gfn;
- if (WARN_ON_ONCE(!sp->spt[i]))
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE))
return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
@@ -933,13 +933,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
return 0;
/*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
+ * Drop the SPTE if the new protections result in no effective
+ * "present" bit or if the gfn is changing. The former case
+ * only affects EPT with execute-only support with pte_access==0;
+ * all other paging modes will create a read-only SPTE if
+ * pte_access is zero.
*/
- if ((!pte_access && !shadow_present_mask) ||
+ if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
gfn != kvm_mmu_page_get_gfn(sp, i)) {
drop_spte(vcpu->kvm, &sp->spt[i]);
return 1;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4a599130e9c9..a5e014d7bc62 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -74,10 +74,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
- WARN_ON_ONCE(!shadow_mmio_value);
+ WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
access &= shadow_mmio_access_mask;
- spte |= shadow_mmio_value | access;
+ spte |= vcpu->kvm->arch.shadow_mmio_value | access;
spte |= gpa | shadow_nonpresent_or_rsvd_mask;
spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
@@ -144,19 +144,19 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 spte = SPTE_MMU_PRESENT_MASK;
bool wrprot = false;
- WARN_ON_ONCE(!pte_access && !shadow_present_mask);
+ /*
+ * For the EPT case, shadow_present_mask has no RWX bits set if
+ * exec-only page table entries are supported. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
else if (kvm_mmu_page_ad_need_write_protect(sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY;
- /*
- * For the EPT case, shadow_present_mask is 0 if hardware
- * supports exec-only page table entries. In that case,
- * ACC_USER_MASK and shadow_user_mask are used to represent
- * read access. See FNAME(gpte_access) in paging_tmpl.h.
- */
spte |= shadow_present_mask;
if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
@@ -322,22 +322,6 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
return spte;
}
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
-{
- u64 new_spte;
-
- new_spte = old_spte & ~SPTE_BASE_ADDR_MASK;
- new_spte |= (u64)new_pfn << PAGE_SHIFT;
-
- new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~shadow_host_writable_mask;
- new_spte &= ~shadow_mmu_writable_mask;
-
- new_spte = mark_spte_for_access_track(new_spte);
-
- return new_spte;
-}
-
u64 mark_spte_for_access_track(u64 spte)
{
if (spte_ad_enabled(spte))
@@ -429,7 +413,9 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
shadow_nx_mask = 0ull;
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
- shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
+ shadow_present_mask =
+ (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
/*
* EPT overrides the host MTRRs, and so KVM must program the desired
* memtype directly into the SPTEs. Note, this mask is just the mask
@@ -446,7 +432,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
* of an EPT paging-structure entry is 110b (write/execute).
*/
kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
- VMX_EPT_RWX_MASK, 0);
+ VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index a129951c9a88..5dd5405fa07a 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -149,6 +149,22 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+/*
+ * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress
+ * #VE and get EPT violations on non-present PTEs. We can use the
+ * same value also without TDX for both VMX and SVM:
+ *
+ * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored.
+ * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0)
+ * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1)
+ */
+#ifdef CONFIG_X86_64
+#define SHADOW_NONPRESENT_VALUE BIT_ULL(63)
+static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
+#else
+#define SHADOW_NONPRESENT_VALUE 0ULL
+#endif
+
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
@@ -190,11 +206,11 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
* both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
- * vulnerability. Use only low bits to avoid 64-bit immediates.
+ * vulnerability.
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE 0x5a0ULL
+#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
@@ -249,9 +265,9 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
return spte_to_child_sp(root);
}
-static inline bool is_mmio_spte(u64 spte)
+static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
likely(enable_mmio_caching);
}
@@ -496,8 +512,6 @@ static inline u64 restore_acc_track_spte(u64 spte)
return spte;
}
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
-
void __init kvm_mmu_spte_module_init(void);
void kvm_mmu_reset_all_pte_masks(void);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 04c1f0957fea..b5be7e949bd8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -495,8 +495,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
- if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
- !is_mmio_spte(new_spte) &&
+ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
+ !is_mmio_spte(kvm, new_spte) &&
!is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
@@ -603,7 +603,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* here since the SPTE is going from non-present to non-present. Use
* the raw write helper to avoid an unnecessary check on volatile bits.
*/
- __kvm_tdp_mmu_write_spte(iter->sptep, 0);
+ __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
return 0;
}
@@ -740,8 +740,8 @@ retry:
continue;
if (!shared)
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
- else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
goto retry;
}
}
@@ -808,8 +808,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
return false;
- tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
- sp->gfn, sp->role.level + 1);
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
+ SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
return true;
}
@@ -843,7 +843,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
/*
* Zappings SPTEs in invalid roots doesn't require a TLB flush,
@@ -1028,7 +1028,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
- if (unlikely(is_mmio_spte(new_spte))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
vcpu->stat.pf_mmio_spte_created++;
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
@@ -1258,52 +1258,6 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
-static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_gfn_range *range)
-{
- u64 new_spte;
-
- /* Huge pages aren't expected to be modified without first being zapped. */
- WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
-
- if (iter->level != PG_LEVEL_4K ||
- !is_shadow_present_pte(iter->old_spte))
- return false;
-
- /*
- * Note, when changing a read-only SPTE, it's not strictly necessary to
- * zero the SPTE before setting the new PFN, but doing so preserves the
- * invariant that the PFN of a present * leaf SPTE can never change.
- * See handle_changed_spte().
- */
- tdp_mmu_iter_set_spte(kvm, iter, 0);
-
- if (!pte_write(range->arg.pte)) {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
- pte_pfn(range->arg.pte));
-
- tdp_mmu_iter_set_spte(kvm, iter, new_spte);
- }
-
- return true;
-}
-
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- /*
- * No need to handle the remote TLB flush under RCU protection, the
- * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
- * shadow page. See the WARN on pfn_changed in handle_changed_spte().
- */
- return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
-}
-
/*
* Remove write access from all SPTEs at or above min_level that map GFNs
* [start, end). Returns true if an SPTE has been changed and the TLBs need to
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 6e1ea04ca885..58b55e61bd33 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -31,7 +31,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush);
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot, int min_level);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 759581bb2128..0623cfaa7bb0 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -23,6 +23,7 @@
#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include "mmu.h"
@@ -32,22 +33,12 @@
#include "cpuid.h"
#include "trace.h"
-#ifndef CONFIG_KVM_AMD_SEV
-/*
- * When this config is not defined, SEV feature is not supported and APIs in
- * this file are not used but this file still gets compiled into the KVM AMD
- * module.
- *
- * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
- * misc_res_type {} defined in linux/misc_cgroup.h.
- *
- * Below macros allow compilation to succeed.
- */
-#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
-#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
-#endif
+#define GHCB_VERSION_MAX 2ULL
+#define GHCB_VERSION_DEFAULT 2ULL
+#define GHCB_VERSION_MIN 1ULL
+
+#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP
-#ifdef CONFIG_KVM_AMD_SEV
/* enable/disable SEV support */
static bool sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);
@@ -57,13 +48,13 @@ static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
/* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = false;
+static bool sev_es_debug_swap_enabled = true;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
-#else
-#define sev_enabled false
-#define sev_es_enabled false
-#define sev_es_debug_swap_enabled false
-#endif /* CONFIG_KVM_AMD_SEV */
+static u64 sev_supported_vmsa_features;
+
+#define AP_RESET_HOLD_NONE 0
+#define AP_RESET_HOLD_NAE_EVENT 1
+#define AP_RESET_HOLD_MSR_PROTO 2
static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
@@ -113,7 +104,15 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
- return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+ return !!to_kvm_sev_info(kvm)->enc_context_owner;
+}
+
+static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
+ return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}
/* Must be called with the sev_bitmap_lock held */
@@ -251,20 +250,44 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
sev_decommission(handle);
}
-static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_init *data,
+ unsigned long vm_type)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_platform_init_args init_args = {0};
+ bool es_active = vm_type != KVM_X86_SEV_VM;
+ u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
int ret;
if (kvm->created_vcpus)
return -EINVAL;
+ if (data->flags)
+ return -EINVAL;
+
+ if (data->vmsa_features & ~valid_vmsa_features)
+ return -EINVAL;
+
+ if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
+ return -EINVAL;
+
if (unlikely(sev->active))
return -EINVAL;
sev->active = true;
- sev->es_active = argp->id == KVM_SEV_ES_INIT;
+ sev->es_active = es_active;
+ sev->vmsa_features = data->vmsa_features;
+ sev->ghcb_version = data->ghcb_version;
+
+ /*
+ * Currently KVM supports the full range of mandatory features defined
+ * by version 2 of the GHCB protocol, so default to that for SEV-ES
+ * guests created via KVM_SEV_INIT2.
+ */
+ if (sev->es_active && !sev->ghcb_version)
+ sev->ghcb_version = GHCB_VERSION_DEFAULT;
+
ret = sev_asid_new(sev);
if (ret)
goto e_no_asid;
@@ -276,6 +299,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
+ sev->need_init = false;
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
@@ -286,11 +310,53 @@ e_free:
sev_asid_free(sev);
sev->asid = 0;
e_no_asid:
+ sev->vmsa_features = 0;
sev->es_active = false;
sev->active = false;
return ret;
}
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_init data = {
+ .vmsa_features = 0,
+ .ghcb_version = 0,
+ };
+ unsigned long vm_type;
+
+ if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
+ return -EINVAL;
+
+ vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
+
+ /*
+ * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
+ * continue to only ever support the minimal GHCB protocol version.
+ */
+ if (vm_type == KVM_X86_SEV_ES_VM)
+ data.ghcb_version = GHCB_VERSION_MIN;
+
+ return __sev_guest_init(kvm, argp, &data, vm_type);
+}
+
+static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_init data;
+
+ if (!sev->need_init)
+ return -EINVAL;
+
+ if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
+ kvm->arch.vm_type != KVM_X86_SEV_ES_VM)
+ return -EINVAL;
+
+ if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
+ return -EFAULT;
+
+ return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
+}
+
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
unsigned int asid = sev_get_asid(kvm);
@@ -339,7 +405,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&start, 0, sizeof(start));
@@ -383,7 +449,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* return handle to userspace */
params.handle = start.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
@@ -522,7 +588,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
vaddr = params.uaddr;
@@ -580,7 +646,13 @@ e_unpin:
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
struct sev_es_save_area *save = svm->sev_es.vmsa;
+ struct xregs_state *xsave;
+ const u8 *s;
+ u8 *d;
+ int i;
/* Check some debug related fields before encrypting the VMSA */
if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
@@ -621,10 +693,44 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
- if (sev_es_debug_swap_enabled) {
- save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
- pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
- "This will not work starting with Linux 6.10\n");
+ save->sev_features = sev->vmsa_features;
+
+ /*
+ * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
+ * breaking older measurements.
+ */
+ if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
+ xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
+ save->x87_dp = xsave->i387.rdp;
+ save->mxcsr = xsave->i387.mxcsr;
+ save->x87_ftw = xsave->i387.twd;
+ save->x87_fsw = xsave->i387.swd;
+ save->x87_fcw = xsave->i387.cwd;
+ save->x87_fop = xsave->i387.fop;
+ save->x87_ds = 0;
+ save->x87_cs = 0;
+ save->x87_rip = xsave->i387.rip;
+
+ for (i = 0; i < 8; i++) {
+ /*
+ * The format of the x87 save area is undocumented and
+ * definitely not what you would expect. It consists of
+ * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
+ * area with bytes 8-9 of each register.
+ */
+ d = save->fpreg_x87 + i * 8;
+ s = ((u8 *)xsave->i387.st_space) + i * 16;
+ memcpy(d, s, 8);
+ save->fpreg_x87[64 + i * 2] = s[8];
+ save->fpreg_x87[64 + i * 2 + 1] = s[9];
+ }
+ memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
+
+ s = get_xsave_addr(xsave, XFEATURE_YMM);
+ if (s)
+ memcpy(save->fpreg_ymm, s, 256);
+ else
+ memset(save->fpreg_ymm, 0, 256);
}
pr_debug("Virtual Machine Save Area (VMSA):\n");
@@ -658,13 +764,20 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
vmsa.reserved = 0;
- vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
+ vmsa.handle = to_kvm_sev_info(kvm)->handle;
vmsa.address = __sme_pa(svm->sev_es.vmsa);
vmsa.len = PAGE_SIZE;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
if (ret)
return ret;
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
+ */
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
vcpu->arch.guest_state_protected = true;
return 0;
}
@@ -695,7 +808,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- void __user *measure = (void __user *)(uintptr_t)argp->data;
+ void __user *measure = u64_to_user_ptr(argp->data);
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
@@ -715,7 +828,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!params.len)
goto cmd;
- p = (void __user *)(uintptr_t)params.uaddr;
+ p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
@@ -788,7 +901,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.state = data.state;
params.handle = data.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
ret = -EFAULT;
return ret;
@@ -953,7 +1066,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
return -EFAULT;
if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
@@ -1037,7 +1150,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
@@ -1101,7 +1214,7 @@ e_unpin_memory:
static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- void __user *report = (void __user *)(uintptr_t)argp->data;
+ void __user *report = u64_to_user_ptr(argp->data);
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
@@ -1112,7 +1225,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&data, 0, sizeof(data));
@@ -1121,7 +1234,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!params.len)
goto cmd;
- p = (void __user *)(uintptr_t)params.uaddr;
+ p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
@@ -1174,7 +1287,7 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
params->session_len = data.session_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
@@ -1193,7 +1306,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_start)))
return -EFAULT;
@@ -1248,7 +1361,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
- if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
session_data, params.session_len)) {
ret = -EFAULT;
goto e_free_amd_cert;
@@ -1256,7 +1369,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.policy = data.policy;
params.session_len = data.session_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
@@ -1287,7 +1400,7 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
params->hdr_len = data.hdr_len;
params->trans_len = data.trans_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_update_data)))
ret = -EFAULT;
@@ -1307,7 +1420,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_update_data)))
return -EFAULT;
@@ -1358,14 +1471,14 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_trans_data;
/* copy transport buffer to user space */
- if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
trans_data, params.trans_len)) {
ret = -EFAULT;
goto e_free_trans_data;
}
/* Copy packet header to userspace. */
- if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
params.hdr_len))
ret = -EFAULT;
@@ -1417,7 +1530,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -ENOTTY;
/* Get parameter from the userspace */
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_start)))
return -EFAULT;
@@ -1459,7 +1572,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
params.handle = start.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ if (copy_to_user(u64_to_user_ptr(argp->data),
&params, sizeof(struct kvm_sev_receive_start))) {
ret = -EFAULT;
sev_unbind_asid(kvm, start.handle);
@@ -1490,7 +1603,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -EINVAL;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_update_data)))
return -EFAULT;
@@ -1705,6 +1818,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
dst->pages_locked = src->pages_locked;
dst->enc_context_owner = src->enc_context_owner;
dst->es_active = src->es_active;
+ dst->vmsa_features = src->vmsa_features;
src->asid = 0;
src->active = false;
@@ -1812,7 +1926,8 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_fput;
- if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
+ sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1861,6 +1976,21 @@ out_fput:
return ret;
}
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
+{
+ if (group != KVM_X86_GRP_SEV)
+ return -ENXIO;
+
+ switch (attr) {
+ case KVM_X86_SEV_VMSA_FEATURES:
+ *val = sev_supported_vmsa_features;
+ return 0;
+
+ default:
+ return -ENXIO;
+ }
+}
+
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1894,6 +2024,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
+ case KVM_SEV_INIT2:
+ r = sev_guest_init2(kvm, &sev_cmd);
+ break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
@@ -2121,6 +2254,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->asid = source_sev->asid;
mirror_sev->fd = source_sev->fd;
mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->need_init = false;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
@@ -2186,15 +2320,18 @@ void sev_vm_destroy(struct kvm *kvm)
void __init sev_set_cpu_caps(void)
{
- if (!sev_enabled)
- kvm_cpu_cap_clear(X86_FEATURE_SEV);
- if (!sev_es_enabled)
- kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
+ if (sev_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
+ }
+ if (sev_es_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
+ }
}
void __init sev_hardware_setup(void)
{
-#ifdef CONFIG_KVM_AMD_SEV
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
bool sev_es_supported = false;
bool sev_supported = false;
@@ -2294,7 +2431,10 @@ out:
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
sev_es_debug_swap_enabled = false;
-#endif
+
+ sev_supported_vmsa_features = 0;
+ if (sev_es_debug_swap_enabled)
+ sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
}
void sev_hardware_unsetup(void)
@@ -2585,6 +2725,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_AP_HLT_LOOP:
case SVM_VMGEXIT_AP_JUMP_TABLE:
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ case SVM_VMGEXIT_HV_FEATURES:
+ case SVM_VMGEXIT_TERM_REQUEST:
break;
default:
reason = GHCB_ERR_INVALID_EVENT;
@@ -2615,6 +2757,9 @@ vmgexit_err:
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
+ /* Clear any indication that the vCPU is in a type of AP Reset Hold */
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
+
if (!svm->sev_es.ghcb)
return;
@@ -2774,6 +2919,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
u64 ghcb_info;
int ret = 1;
@@ -2784,7 +2930,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
switch (ghcb_info) {
case GHCB_MSR_SEV_INFO_REQ:
- set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
break;
@@ -2826,6 +2972,28 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_INFO_POS);
break;
}
+ case GHCB_MSR_AP_RESET_HOLD_REQ:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
+ ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+
+ /*
+ * Preset the result to a non-SIPI return and then only set
+ * the result to non-zero when delivering a SIPI.
+ */
+ set_ghcb_msr_bits(svm, 0,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_HV_FT_REQ:
+ set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
+ GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
+ GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
+ break;
case GHCB_MSR_TERM_REQ: {
u64 reason_set, reason_code;
@@ -2925,6 +3093,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = 1;
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
@@ -2949,6 +3118,19 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = 1;
break;
}
+ case SVM_VMGEXIT_HV_FEATURES:
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED);
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_TERM_REQUEST:
+ pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+ break;
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
@@ -3063,7 +3245,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, TRAP_CR8_WRITE);
vmcb->control.intercepts[INTERCEPT_DR] = 0;
- if (!sev_es_debug_swap_enabled) {
+ if (!sev_vcpu_has_debug_swap(svm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
@@ -3109,16 +3291,19 @@ void sev_init_vmcb(struct vcpu_svm *svm)
void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
/*
* Set the GHCB MSR value as per the GHCB specification when emulating
* vCPU RESET for an SEV-ES guest.
*/
- set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
}
-void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
/*
* All host state for SEV-ES guests is categorized into three swap types
@@ -3146,7 +3331,7 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
* saves and loads debug registers (Type-A).
*/
- if (sev_es_debug_swap_enabled) {
+ if (sev_vcpu_has_debug_swap(svm)) {
hostsa->dr0 = native_get_debugreg(0);
hostsa->dr1 = native_get_debugreg(1);
hostsa->dr2 = native_get_debugreg(2);
@@ -3168,15 +3353,31 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
return;
}
- /*
- * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
- * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
- * non-zero value.
- */
- if (!svm->sev_es.ghcb)
- return;
+ /* Subsequent SIPI */
+ switch (svm->sev_es.ap_reset_hold_type) {
+ case AP_RESET_HOLD_NAE_EVENT:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
+ */
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ break;
+ case AP_RESET_HOLD_MSR_PROTO:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set GHCB data field to a non-zero value.
+ */
+ set_ghcb_msr_bits(svm, 1,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ default:
+ break;
+ }
}
struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9aaf83c8d57d..c8dc25886c16 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1433,14 +1433,6 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
vmsa_page = snp_safe_alloc_page(vcpu);
if (!vmsa_page)
goto error_free_vmcb_page;
-
- /*
- * SEV-ES guests maintain an encrypted version of their FPU
- * state which is restored and saved on VMRUN and VMEXIT.
- * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
- * do xsave/xrstor on it.
- */
- fpstate_set_confidential(&vcpu->arch.guest_fpu);
}
err = avic_init_vcpu(svm);
@@ -1525,7 +1517,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/
vmsave(sd->save_area_pa);
if (sev_es_guest(vcpu->kvm))
- sev_es_prepare_switch_to_guest(sev_es_host_save_area(sd));
+ sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));
if (tsc_scaling)
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
@@ -2056,6 +2048,15 @@ static int npf_interception(struct kvm_vcpu *vcpu)
u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
+ /*
+ * WARN if hardware generates a fault with an error code that collides
+ * with KVM-defined sythentic flags. Clear the flags and continue on,
+ * i.e. don't terminate the VM, as KVM can't possibly be relying on a
+ * flag that KVM doesn't know about.
+ */
+ if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
+ error_code &= ~PFERR_SYNTHETIC_MASK;
+
trace_kvm_page_fault(vcpu, fault_address, error_code);
return kvm_mmu_page_fault(vcpu, fault_address, error_code,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
@@ -3304,7 +3305,9 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+#ifdef CONFIG_KVM_AMD_SEV
[SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
+#endif
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -4085,6 +4088,9 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
+ if (to_kvm_sev_info(vcpu->kvm)->need_init)
+ return -EINVAL;
+
return 1;
}
@@ -4892,6 +4898,14 @@ static void svm_vm_destroy(struct kvm *kvm)
static int svm_vm_init(struct kvm *kvm)
{
+ int type = kvm->arch.vm_type;
+
+ if (type != KVM_X86_DEFAULT_VM &&
+ type != KVM_X86_SW_PROTECTED_VM) {
+ kvm->arch.has_protected_state = (type == KVM_X86_SEV_ES_VM);
+ to_kvm_sev_info(kvm)->need_init = true;
+ }
+
if (!pause_filter_count || !pause_filter_thresh)
kvm->arch.pause_in_guest = true;
@@ -5026,6 +5040,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_smi_window = svm_enable_smi_window,
#endif
+#ifdef CONFIG_KVM_AMD_SEV
+ .dev_get_attr = sev_dev_get_attr,
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
.mem_enc_unregister_region = sev_mem_enc_unregister_region,
@@ -5033,7 +5049,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,
-
+#endif
.check_emulate_instruction = svm_check_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 33878efdebc8..be57213cd295 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,12 +79,15 @@ enum {
struct kvm_sev_info {
bool active; /* SEV enabled guest */
bool es_active; /* SEV-ES enabled guest */
+ bool need_init; /* waiting for SEV_INIT2 */
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ u64 vmsa_features;
+ u16 ghcb_version; /* Highest guest GHCB protocol version allowed */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
struct list_head mirror_vms; /* List of VMs mirroring */
struct list_head mirror_entry; /* Use as a list entry of mirrors */
@@ -197,6 +200,7 @@ struct vcpu_sev_es_state {
u8 valid_bitmap[16];
struct kvm_host_map ghcb_map;
bool received_first_sipi;
+ unsigned int ap_reset_hold_type;
/* SEV-ES scratch area support */
u64 sw_scratch;
@@ -318,6 +322,11 @@ static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
return container_of(kvm, struct kvm_svm, kvm);
}
+static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm)
+{
+ return &to_kvm_svm(kvm)->sev_info;
+}
+
static __always_inline bool sev_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
@@ -664,13 +673,16 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
/* sev.c */
-#define GHCB_VERSION_MAX 1ULL
-#define GHCB_VERSION_MIN 1ULL
-
-
-extern unsigned int max_sev_asid;
+void pre_sev_run(struct vcpu_svm *svm, int cpu);
+void sev_init_vmcb(struct vcpu_svm *svm);
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
+void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
-void sev_vm_destroy(struct kvm *kvm);
+#ifdef CONFIG_KVM_AMD_SEV
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
int sev_mem_enc_register_region(struct kvm *kvm,
struct kvm_enc_region *range);
@@ -679,22 +691,32 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
void sev_guest_memory_reclaimed(struct kvm *kvm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
-void pre_sev_run(struct vcpu_svm *svm, int cpu);
+/* These symbols are used in common code and are stubbed below. */
+struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
+void sev_free_vcpu(struct kvm_vcpu *vcpu);
+void sev_vm_destroy(struct kvm *kvm);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
-void sev_init_vmcb(struct vcpu_svm *svm);
-void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
-void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
-int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
-void sev_es_vcpu_reset(struct vcpu_svm *svm);
-void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
-void sev_es_unmap_ghcb(struct vcpu_svm *svm);
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val);
+extern unsigned int max_sev_asid;
+#else
+static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) {
+ return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+}
+
+static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {}
+static inline void sev_vm_destroy(struct kvm *kvm) {}
+static inline void __init sev_set_cpu_caps(void) {}
+static inline void __init sev_hardware_setup(void) {}
+static inline void sev_hardware_unsetup(void) {}
+static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; }
+static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; }
+#define max_sev_asid 0
+#endif
/* vmenter.S */
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
new file mode 100644
index 000000000000..7c546ad3e4c9
--- /dev/null
+++ b/arch/x86/kvm/vmx/main.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/moduleparam.h>
+
+#include "x86_ops.h"
+#include "vmx.h"
+#include "nested.h"
+#include "pmu.h"
+
+#define VMX_REQUIRED_APICV_INHIBITS \
+ (BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED))
+
+struct kvm_x86_ops vt_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
+
+ .hardware_unsetup = vmx_hardware_unsetup,
+
+ .hardware_enable = vmx_hardware_enable,
+ .hardware_disable = vmx_hardware_disable,
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_size = sizeof(struct kvm_vmx),
+ .vm_init = vmx_vm_init,
+ .vm_destroy = vmx_vm_destroy,
+
+ .vcpu_precreate = vmx_vcpu_precreate,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_exception_bitmap = vmx_update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .is_valid_cr0 = vmx_is_valid_cr0,
+ .set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
+
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
+
+ .vcpu_pre_run = vmx_vcpu_pre_run,
+ .vcpu_run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
+ .inject_exception = vmx_inject_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_interrupt = vmx_deliver_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .write_tsc_multiplier = vmx_write_tsc_multiplier,
+
+ .load_mmu_pgd = vmx_load_mmu_pgd,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .sched_in = vmx_sched_in,
+
+ .cpu_dirty_log_size = PML_ENTITY_NUM,
+ .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
+
+ .nested_ops = &vmx_nested_ops,
+
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = vmx_smi_allowed,
+ .enter_smm = vmx_enter_smm,
+ .leave_smm = vmx_leave_smm,
+ .enable_smi_window = vmx_enable_smi_window,
+#endif
+
+ .check_emulate_instruction = vmx_check_emulate_instruction,
+ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vmx_msr_filter_changed,
+ .complete_emulated_msr = kvm_complete_insn_gp,
+
+ .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .get_untagged_addr = vmx_get_untagged_addr,
+};
+
+struct kvm_x86_init_ops vt_init_ops __initdata = {
+ .hardware_setup = vmx_hardware_setup,
+ .handle_intel_pt_intr = NULL,
+
+ .runtime_ops = &vt_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
+};
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7c1996b433e2..b25625314658 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
return is_exception_n(intr_info, NM_VECTOR);
}
+static inline bool is_ve_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, VE_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 22411f4aff53..ad36e5eb6667 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -68,6 +68,7 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
@@ -530,8 +531,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
-static struct kvm_x86_ops vmx_x86_ops __initdata;
-
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -581,9 +580,8 @@ static __init void hv_init_evmcs(void)
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
+ vt_x86_ops.enable_l2_tlb_flush
= hv_enable_l2_tlb_flush;
-
} else {
enlightened_vmcs = false;
}
@@ -874,6 +872,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
+ /*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
@@ -1477,7 +1481,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1488,7 +1492,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
@@ -1547,7 +1551,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
-static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
@@ -1653,8 +1657,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1738,7 +1742,7 @@ rip_updated:
* Recognizes a pending MTF VM-exit and records the nested state for later
* delivery.
*/
-static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1769,7 +1773,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
}
}
-static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
vmx_update_emulated_instruction(vcpu);
return skip_emulated_instruction(vcpu);
@@ -1788,7 +1792,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
-static void vmx_inject_exception(struct kvm_vcpu *vcpu)
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
@@ -1909,12 +1913,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
return kvm_caps.default_tsc_scaling_ratio;
}
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
@@ -1957,7 +1961,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
return !(msr->data & ~valid_bits);
}
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
@@ -1974,7 +1978,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2155,7 +2159,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2458,7 +2462,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return ret;
}
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
unsigned long guest_owned_bits;
@@ -2606,6 +2610,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_2nd_exec_control))
return -EIO;
}
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2630,6 +2637,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
@@ -2759,7 +2767,7 @@ static bool kvm_is_vmx_supported(void)
return supported;
}
-static int vmx_check_processor_compat(void)
+int vmx_check_processor_compat(void)
{
int cpu = raw_smp_processor_id();
struct vmcs_config vmcs_conf;
@@ -2801,7 +2809,7 @@ fault:
return -EFAULT;
}
-static int vmx_hardware_enable(void)
+int vmx_hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2841,7 +2849,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void vmx_hardware_disable(void)
+void vmx_hardware_disable(void)
{
vmclear_local_loaded_vmcss();
@@ -3155,7 +3163,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3185,7 +3193,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->vpid;
}
-static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
u64 root_hpa = mmu->root.hpa;
@@ -3201,7 +3209,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
* vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
@@ -3210,7 +3218,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
-static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
* vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
@@ -3255,7 +3263,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
CPU_BASED_CR3_STORE_EXITING)
-static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (is_guest_mode(vcpu))
return nested_guest_cr0_valid(vcpu, cr0);
@@ -3376,8 +3384,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
return eptp;
}
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
- int root_level)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3406,8 +3413,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcs_writel(GUEST_CR3, guest_cr3);
}
-
-static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
* We operate under the default treatment of SMM, so VMX cannot be
@@ -3523,7 +3529,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->g = (ar >> 15) & 1;
}
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment s;
@@ -3600,14 +3606,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
}
-static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
__vmx_set_segment(vcpu, var, seg);
to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
@@ -3615,25 +3621,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
vmcs_writel(GUEST_GDTR_BASE, dt->address);
@@ -4101,7 +4107,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
void *vapic_page;
@@ -4121,7 +4127,7 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
return ((rvi & 0xf0) > (vppr & 0xf0));
}
-static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 i;
@@ -4265,8 +4271,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
return 0;
}
-static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector)
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
{
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -4428,7 +4434,7 @@ static u32 vmx_vmexit_ctrl(void)
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4594,6 +4600,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
}
if (!enable_unrestricted_guest)
@@ -4692,7 +4699,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
return 0;
}
-static int vmx_vcpu_precreate(struct kvm *kvm)
+int vmx_vcpu_precreate(struct kvm *kvm)
{
return vmx_alloc_ipiv_pid_table(kvm);
}
@@ -4717,8 +4724,12 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls())
+ if (cpu_has_secondary_exec_ctrls()) {
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
+ }
if (cpu_has_tertiary_exec_ctrls())
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -4847,7 +4858,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->pi_desc.sn = 1;
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4906,12 +4917,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_update_fb_clear_dis(vcpu, vmx);
}
-static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
@@ -4922,7 +4933,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
@@ -4950,7 +4961,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
vmx_clear_hlt(vcpu);
}
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5028,7 +5039,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
GUEST_INTR_STATE_NMI));
}
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5050,7 +5061,7 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5065,7 +5076,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_interrupt_blocked(vcpu);
}
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
void __user *ret;
@@ -5085,7 +5096,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return init_rmode_tss(kvm, ret);
}
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
return 0;
@@ -5206,6 +5217,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
+ if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
+ return -EIO;
+
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -5371,8 +5385,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
return kvm_fast_pio(vcpu, size, port, in);
}
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
/*
* Patch in the VMCALL instruction:
@@ -5578,7 +5591,7 @@ out:
return kvm_complete_insn_gp(vcpu, err);
}
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
@@ -5597,7 +5610,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
}
@@ -5868,7 +5881,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
-static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
@@ -6156,9 +6169,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
- u64 *info1, u64 *info2,
- u32 *intr_info, u32 *error_code)
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6416,6 +6428,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+ * already. Derefencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
+ }
}
/*
@@ -6601,7 +6631,7 @@ unexpected_vmexit:
return 0;
}
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
@@ -6689,7 +6719,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
: "eax", "ebx", "ecx", "edx");
}
-static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6759,7 +6789,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
vmx_update_msr_bitmap_x2apic(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
@@ -6828,7 +6858,7 @@ out:
kvm_release_pfn_clean(pfn);
}
-static void vmx_hwapic_isr_update(int max_isr)
+void vmx_hwapic_isr_update(int max_isr)
{
u16 status;
u8 old;
@@ -6862,7 +6892,7 @@ static void vmx_set_rvi(int vector)
}
}
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
/*
* When running L2, updating RVI is only relevant when
@@ -6876,7 +6906,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
vmx_set_rvi(max_irr);
}
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
@@ -6922,7 +6952,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -6933,7 +6963,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6964,24 +6994,22 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
-
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
- vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* if exit due to NM, handle before interrupts are enabled */
else if (is_nm_fault(intr_info))
- handle_nm_fault_irqoff(&vmx->vcpu);
+ handle_nm_fault_irqoff(vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
}
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(vcpu);
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
@@ -6998,7 +7026,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
vcpu->arch.at_instruction_boundary = true;
}
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7006,16 +7034,16 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
return;
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
- handle_external_interrupt_irqoff(vcpu);
+ handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_irqoff(vmx);
+ handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
}
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
*/
-static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
@@ -7138,7 +7166,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
IDT_VECTORING_ERROR_CODE);
}
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
__vmx_complete_interrupts(vcpu,
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -7308,7 +7336,7 @@ out:
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7463,7 +7491,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
-static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7472,9 +7500,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_page((unsigned long)vmx->ve_info);
}
-static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
@@ -7565,6 +7594,20 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
+
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
+
+ vmx->ve_info = page_to_virt(page);
+ }
+
if (vmx_can_use_ipiv(vcpu))
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
__pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
@@ -7583,7 +7626,7 @@ free_vpid:
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-static int vmx_vm_init(struct kvm *kvm)
+int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -7614,7 +7657,7 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
}
-static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
@@ -7786,7 +7829,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8001,10 +8044,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage,
- struct x86_exception *exception)
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -8084,8 +8127,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
return 0;
}
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
- bool *expired)
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
@@ -8124,13 +8167,13 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
return 0;
}
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
+void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (!kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
@@ -8159,7 +8202,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -8170,7 +8213,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_KVM_SMM
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
if (to_vmx(vcpu)->nested.nested_run_pending)
@@ -8178,7 +8221,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8199,7 +8242,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -8220,18 +8263,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
return 0;
}
-static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
#endif
-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
-static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
@@ -8241,7 +8284,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void vmx_hardware_unsetup(void)
+void vmx_hardware_unsetup(void)
{
kvm_set_posted_intr_wakeup_handler(NULL);
@@ -8251,18 +8294,7 @@ static void vmx_hardware_unsetup(void)
free_kvm_area();
}
-#define VMX_REQUIRED_APICV_INHIBITS \
-( \
- BIT(APICV_INHIBIT_REASON_DISABLE)| \
- BIT(APICV_INHIBIT_REASON_ABSENT) | \
- BIT(APICV_INHIBIT_REASON_HYPERV) | \
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
- BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
-)
-
-static void vmx_vm_destroy(struct kvm *kvm)
+void vmx_vm_destroy(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
@@ -8313,148 +8345,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags
return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}
-static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = KBUILD_MODNAME,
-
- .check_processor_compatibility = vmx_check_processor_compat,
-
- .hardware_unsetup = vmx_hardware_unsetup,
-
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
-
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .is_valid_cr0 = vmx_is_valid_cr0,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .get_if_flag = vmx_get_if_flag,
-
- .flush_tlb_all = vmx_flush_tlb_all,
- .flush_tlb_current = vmx_flush_tlb_current,
- .flush_tlb_gva = vmx_flush_tlb_gva,
- .flush_tlb_guest = vmx_flush_tlb_guest,
-
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .inject_irq = vmx_inject_irq,
- .inject_nmi = vmx_inject_nmi,
- .inject_exception = vmx_inject_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
- .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_interrupt = vmx_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
-
- .load_mmu_pgd = vmx_load_mmu_pgd,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
-
- .sched_in = vmx_sched_in,
-
- .cpu_dirty_log_size = PML_ENTITY_NUM,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
- .nested_ops = &vmx_nested_ops,
-
- .pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
-
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
- .setup_mce = vmx_setup_mce,
-
-#ifdef CONFIG_KVM_SMM
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
-#endif
-
- .check_emulate_instruction = vmx_check_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
- .migrate_timers = vmx_migrate_timers,
-
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
-
- .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
-
- .get_untagged_addr = vmx_get_untagged_addr,
-};
-
static unsigned int vmx_handle_intel_pt_intr(void)
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
@@ -8520,9 +8410,7 @@ static void __init vmx_setup_me_spte_mask(void)
kvm_mmu_set_me_spte_mask(0, me_mask);
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata;
-
-static __init int hardware_setup(void)
+__init int vmx_hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
@@ -8591,16 +8479,16 @@ static __init int hardware_setup(void)
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- vmx_x86_ops.set_apic_access_page_addr = NULL;
+ vt_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- vmx_x86_ops.update_cr8_intercept = NULL;
+ vt_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
- vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
@@ -8615,7 +8503,7 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
if (!enable_apicv)
- vmx_x86_ops.sync_pir_to_irr = NULL;
+ vt_x86_ops.sync_pir_to_irr = NULL;
if (!enable_apicv || !cpu_has_vmx_ipiv())
enable_ipiv = false;
@@ -8651,7 +8539,7 @@ static __init int hardware_setup(void)
enable_pml = 0;
if (!enable_pml)
- vmx_x86_ops.cpu_dirty_log_size = 0;
+ vt_x86_ops.cpu_dirty_log_size = 0;
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
@@ -8676,8 +8564,8 @@ static __init int hardware_setup(void)
}
if (!enable_preemption_timer) {
- vmx_x86_ops.set_hv_timer = NULL;
- vmx_x86_ops.cancel_hv_timer = NULL;
+ vt_x86_ops.set_hv_timer = NULL;
+ vt_x86_ops.cancel_hv_timer = NULL;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
@@ -8688,9 +8576,9 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
if (pt_mode == PT_MODE_HOST_GUEST)
- vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
else
- vmx_init_ops.handle_intel_pt_intr = NULL;
+ vt_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
@@ -8713,14 +8601,6 @@ static __init int hardware_setup(void)
return r;
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .hardware_setup = hardware_setup,
- .handle_intel_pt_intr = NULL,
-
- .runtime_ops = &vmx_x86_ops,
- .pmu_ops = &intel_pmu_ops,
-};
-
static void vmx_cleanup_l1d_flush(void)
{
if (vmx_l1d_flush_pages) {
@@ -8762,7 +8642,7 @@ static int __init vmx_init(void)
*/
hv_init_evmcs();
- r = kvm_x86_vendor_init(&vmx_init_ops);
+ r = kvm_x86_vendor_init(&vt_init_ops);
if (r)
return r;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 90f9e4434646..e4a605c5808e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -365,6 +365,9 @@ struct vcpu_vmx {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
} shadow_msr_intercept;
+
+ /* ve_info must be page aligned. */
+ struct vmx_ve_information *ve_info;
};
struct kvm_vmx {
@@ -577,7 +580,8 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_ENABLE_VMFUNC | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
- SECONDARY_EXEC_ENCLS_EXITING)
+ SECONDARY_EXEC_ENCLS_EXITING | \
+ SECONDARY_EXEC_EPT_VIOLATION_VE)
#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
new file mode 100644
index 000000000000..502704596c83
--- /dev/null
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_X86_OPS_H
+#define __KVM_X86_VMX_X86_OPS_H
+
+#include <linux/kvm_host.h>
+
+#include "x86.h"
+
+__init int vmx_hardware_setup(void);
+
+extern struct kvm_x86_ops vt_x86_ops __initdata;
+extern struct kvm_x86_init_ops vt_init_ops __initdata;
+
+void vmx_hardware_unsetup(void);
+int vmx_check_processor_compat(void);
+int vmx_hardware_enable(void);
+void vmx_hardware_disable(void);
+int vmx_vm_init(struct kvm *kvm);
+void vmx_vm_destroy(struct kvm *kvm);
+int vmx_vcpu_precreate(struct kvm *kvm);
+int vmx_vcpu_create(struct kvm_vcpu *vcpu);
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+void vmx_vcpu_free(struct kvm_vcpu *vcpu);
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_put(struct kvm_vcpu *vcpu);
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath);
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu);
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu);
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu);
+#endif
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception);
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
+void vmx_migrate_timers(struct kvm_vcpu *vcpu);
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
+bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
+void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void vmx_hwapic_isr_update(int max_isr);
+bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
+int vmx_get_msr_feature(struct kvm_msr_entry *msr);
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int vmx_get_cpl(struct kvm_vcpu *vcpu);
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr);
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu);
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall);
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected);
+void vmx_inject_nmi(struct kvm_vcpu *vcpu);
+void vmx_inject_exception(struct kvm_vcpu *vcpu);
+void vmx_cancel_injection(struct kvm_vcpu *vcpu);
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu);
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu);
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr);
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu);
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_request_immediate_exit(struct kvm_vcpu *vcpu);
+void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu);
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_X86_64
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired);
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
+#endif
+void vmx_setup_mce(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 91478b769af0..fda22b3800a1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,9 +92,12 @@
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
-struct kvm_caps kvm_caps __read_mostly = {
- .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
-};
+/*
+ * Note, kvm_caps fields should *never* have default values, all fields must be
+ * recomputed from scratch during vendor module load, e.g. to account for a
+ * vendor module being reloaded with different module parameters.
+ */
+struct kvm_caps kvm_caps __read_mostly;
EXPORT_SYMBOL_GPL(kvm_caps);
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
@@ -4629,9 +4632,7 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
static bool kvm_is_vm_type_supported(unsigned long type)
{
- return type == KVM_X86_DEFAULT_VM ||
- (type == KVM_X86_SW_PROTECTED_VM &&
- IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
+ return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -4832,9 +4833,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = kvm_caps.has_notify_vmexit;
break;
case KVM_CAP_VM_TYPES:
- r = BIT(KVM_X86_DEFAULT_VM);
- if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
- r |= BIT(KVM_X86_SW_PROTECTED_VM);
+ r = kvm_caps.supported_vm_types;
break;
default:
break;
@@ -4842,46 +4841,44 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
return r;
}
-static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
-{
- void __user *uaddr = (void __user*)(unsigned long)attr->addr;
-
- if ((u64)(unsigned long)uaddr != attr->addr)
- return ERR_PTR_USR(-EFAULT);
- return uaddr;
-}
-
-static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val)
{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
-
- if (attr->group)
+ if (attr->group) {
+ if (kvm_x86_ops.dev_get_attr)
+ return static_call(kvm_x86_dev_get_attr)(attr->group, attr->attr, val);
return -ENXIO;
-
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
+ }
switch (attr->attr) {
case KVM_X86_XCOMP_GUEST_SUPP:
- if (put_user(kvm_caps.supported_xcr0, uaddr))
- return -EFAULT;
+ *val = kvm_caps.supported_xcr0;
return 0;
default:
return -ENXIO;
}
}
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
+ int r;
+ u64 val;
+
+ r = __kvm_x86_dev_get_attr(attr, &val);
+ if (r < 0)
+ return r;
+
+ if (put_user(val, uaddr))
+ return -EFAULT;
+
+ return 0;
+}
+
static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
{
- if (attr->group)
- return -ENXIO;
+ u64 val;
- switch (attr->attr) {
- case KVM_X86_XCOMP_GUEST_SUPP:
- return 0;
- default:
- return -ENXIO;
- }
+ return __kvm_x86_dev_get_attr(attr, &val);
}
long kvm_arch_dev_ioctl(struct file *filp,
@@ -5557,11 +5554,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return 0;
}
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
+static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
{
unsigned int i;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
memset(dbgregs, 0, sizeof(*dbgregs));
BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
@@ -5570,6 +5571,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
+ return 0;
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -5577,6 +5579,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
{
unsigned int i;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (dbgregs->flags)
return -EINVAL;
@@ -5597,8 +5603,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
}
-static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
- u8 *state, unsigned int size)
+static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
{
/*
* Only copy state for features that are enabled for the guest. The
@@ -5616,24 +5622,25 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
XFEATURE_MASK_FPSSE;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
supported_xcr0, vcpu->arch.pkru);
+ return 0;
}
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
+static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
{
- kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- sizeof(guest_xsave->region));
+ return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
}
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
guest_xsave->region,
@@ -5641,18 +5648,23 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
&vcpu->arch.pkru);
}
-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
+static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
guest_xcrs->nr_xcrs = 0;
- return;
+ return 0;
}
guest_xcrs->nr_xcrs = 1;
guest_xcrs->flags = 0;
guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+ return 0;
}
static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
@@ -5660,6 +5672,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
{
int i, r = 0;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -EINVAL;
@@ -5712,12 +5728,9 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
int r;
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
-
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET:
r = -EFAULT;
@@ -5735,13 +5748,10 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
struct kvm *kvm = vcpu->kvm;
int r;
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
-
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET: {
u64 offset, tsc, ns;
@@ -6048,7 +6058,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_DEBUGREGS: {
struct kvm_debugregs dbgregs;
- kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, &dbgregs,
@@ -6078,7 +6090,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xsave)
break;
- kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
@@ -6107,7 +6121,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xsave)
break;
- kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xsave, size))
@@ -6123,7 +6139,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xcrs)
break;
- kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xcrs,
@@ -6267,6 +6285,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#endif
case KVM_GET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
r = -ENOMEM;
if (!u.sregs2)
@@ -6279,6 +6302,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
if (IS_ERR(u.sregs2)) {
r = PTR_ERR(u.sregs2);
@@ -9732,6 +9760,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return -EIO;
}
+ memset(&kvm_caps, 0, sizeof(kvm_caps));
+
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
pr_err("failed to allocate cache for x86 emulator\n");
@@ -9750,6 +9780,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (r)
goto out_free_percpu;
+ kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
+ kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
+
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@ -9795,6 +9828,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
+
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
@@ -10051,26 +10087,15 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl)
{
- unsigned long nr, a0, a1, a2, a3, ret;
- int op_64_bit;
-
- if (kvm_xen_hypercall_enabled(vcpu->kvm))
- return kvm_xen_hypercall(vcpu);
-
- if (kvm_hv_hypercall_enabled(vcpu))
- return kvm_hv_hypercall(vcpu);
-
- nr = kvm_rax_read(vcpu);
- a0 = kvm_rbx_read(vcpu);
- a1 = kvm_rcx_read(vcpu);
- a2 = kvm_rdx_read(vcpu);
- a3 = kvm_rsi_read(vcpu);
+ unsigned long ret;
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- op_64_bit = is_64_bit_hypercall(vcpu);
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
@@ -10079,7 +10104,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
+ if (cpl) {
ret = -KVM_EPERM;
goto out;
}
@@ -10140,18 +10165,49 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ /* stat is incremented on completion. */
return 0;
}
default:
ret = -KVM_ENOSYS;
break;
}
+
out:
+ ++vcpu->stat.hypercalls;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+ int op_64_bit;
+ int cpl;
+
+ if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ return kvm_xen_hypercall(vcpu);
+
+ if (kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ nr = kvm_rax_read(vcpu);
+ a0 = kvm_rbx_read(vcpu);
+ a1 = kvm_rcx_read(vcpu);
+ a2 = kvm_rdx_read(vcpu);
+ a3 = kvm_rsi_read(vcpu);
+ op_64_bit = is_64_bit_hypercall(vcpu);
+ cpl = static_call(kvm_x86_get_cpl)(vcpu);
+
+ ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
+ if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
+ /* MAP_GPA tosses the request to the user space. */
+ return 0;
+
if (!op_64_bit)
ret = (u32)ret;
kvm_rax_write(vcpu, ret);
- ++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -11486,6 +11542,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__get_regs(vcpu, regs);
vcpu_put(vcpu);
@@ -11527,6 +11587,10 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__set_regs(vcpu, regs);
vcpu_put(vcpu);
@@ -11599,6 +11663,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__get_sregs(vcpu, sregs);
vcpu_put(vcpu);
@@ -11866,6 +11934,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int ret;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
ret = __set_sregs(vcpu, sregs);
vcpu_put(vcpu);
@@ -11983,7 +12055,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
struct fxregs_state *fxsave;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
vcpu_load(vcpu);
@@ -12006,7 +12078,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
struct fxregs_state *fxsave;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
vcpu_load(vcpu);
@@ -12532,6 +12604,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return -EINVAL;
kvm->arch.vm_type = type;
+ kvm->arch.has_private_mem =
+ (type == KVM_X86_SW_PROTECTED_VM);
ret = kvm_page_track_init(kvm);
if (ret)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a8b71803777b..d80a4c6b5a38 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -24,6 +24,8 @@ struct kvm_caps {
bool has_bus_lock_exit;
/* notify VM exit supported? */
bool has_notify_vmexit;
+ /* bit mask of VM types */
+ u32 supported_vm_types;
u64 supported_mce_cap;
u64 supported_xcr0;
diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
index b4efdddb2ad9..78c490e0505a 100644
--- a/drivers/perf/riscv_pmu.c
+++ b/drivers/perf/riscv_pmu.c
@@ -191,8 +191,6 @@ void riscv_pmu_stop(struct perf_event *event, int flags)
struct hw_perf_event *hwc = &event->hw;
struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
- WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-
if (!(hwc->state & PERF_HES_STOPPED)) {
if (rvpmu->ctr_stop) {
rvpmu->ctr_stop(event, 0);
@@ -408,6 +406,7 @@ struct riscv_pmu *riscv_pmu_alloc(void)
cpuc->n_events = 0;
for (i = 0; i < RISCV_MAX_COUNTERS; i++)
cpuc->events[i] = NULL;
+ cpuc->snapshot_addr = NULL;
}
pmu->pmu = (struct pmu) {
.event_init = riscv_pmu_event_init,
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 8cbe6e5f9c39..5d699b06dcb6 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -27,7 +27,7 @@
#define ALT_SBI_PMU_OVERFLOW(__ovl) \
asm volatile(ALTERNATIVE_2( \
- "csrr %0, " __stringify(CSR_SSCOUNTOVF), \
+ "csrr %0, " __stringify(CSR_SCOUNTOVF), \
"csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF), \
THEAD_VENDOR_ID, ERRATA_THEAD_PMU, \
CONFIG_ERRATA_THEAD_PMU, \
@@ -57,6 +57,11 @@ asm volatile(ALTERNATIVE( \
PMU_FORMAT_ATTR(event, "config:0-47");
PMU_FORMAT_ATTR(firmware, "config:63");
+static bool sbi_v2_available;
+static DEFINE_STATIC_KEY_FALSE(sbi_pmu_snapshot_available);
+#define sbi_pmu_snapshot_available() \
+ static_branch_unlikely(&sbi_pmu_snapshot_available)
+
static struct attribute *riscv_arch_formats_attr[] = {
&format_attr_event.attr,
&format_attr_firmware.attr,
@@ -384,7 +389,7 @@ static int pmu_sbi_ctr_get_idx(struct perf_event *event)
cmask = 1;
} else if (event->attr.config == PERF_COUNT_HW_INSTRUCTIONS) {
cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH;
- cmask = 1UL << (CSR_INSTRET - CSR_CYCLE);
+ cmask = BIT(CSR_INSTRET - CSR_CYCLE);
}
}
@@ -506,24 +511,126 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig)
return ret;
}
+static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct cpu_hw_events *cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+
+ if (!cpu_hw_evt->snapshot_addr)
+ continue;
+
+ free_page((unsigned long)cpu_hw_evt->snapshot_addr);
+ cpu_hw_evt->snapshot_addr = NULL;
+ cpu_hw_evt->snapshot_addr_phys = 0;
+ }
+}
+
+static int pmu_sbi_snapshot_alloc(struct riscv_pmu *pmu)
+{
+ int cpu;
+ struct page *snapshot_page;
+
+ for_each_possible_cpu(cpu) {
+ struct cpu_hw_events *cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+
+ snapshot_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!snapshot_page) {
+ pmu_sbi_snapshot_free(pmu);
+ return -ENOMEM;
+ }
+ cpu_hw_evt->snapshot_addr = page_to_virt(snapshot_page);
+ cpu_hw_evt->snapshot_addr_phys = page_to_phys(snapshot_page);
+ }
+
+ return 0;
+}
+
+static int pmu_sbi_snapshot_disable(void)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, SBI_SHMEM_DISABLE,
+ SBI_SHMEM_DISABLE, 0, 0, 0, 0);
+ if (ret.error) {
+ pr_warn("failed to disable snapshot shared memory\n");
+ return sbi_err_map_linux_errno(ret.error);
+ }
+
+ return 0;
+}
+
+static int pmu_sbi_snapshot_setup(struct riscv_pmu *pmu, int cpu)
+{
+ struct cpu_hw_events *cpu_hw_evt;
+ struct sbiret ret = {0};
+
+ cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+ if (!cpu_hw_evt->snapshot_addr_phys)
+ return -EINVAL;
+
+ if (cpu_hw_evt->snapshot_set_done)
+ return 0;
+
+ if (IS_ENABLED(CONFIG_32BIT))
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM,
+ cpu_hw_evt->snapshot_addr_phys,
+ (u64)(cpu_hw_evt->snapshot_addr_phys) >> 32, 0, 0, 0, 0);
+ else
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM,
+ cpu_hw_evt->snapshot_addr_phys, 0, 0, 0, 0, 0);
+
+ /* Free up the snapshot area memory and fall back to SBI PMU calls without snapshot */
+ if (ret.error) {
+ if (ret.error != SBI_ERR_NOT_SUPPORTED)
+ pr_warn("pmu snapshot setup failed with error %ld\n", ret.error);
+ return sbi_err_map_linux_errno(ret.error);
+ }
+
+ memset(cpu_hw_evt->snapshot_cval_shcopy, 0, sizeof(u64) * RISCV_MAX_COUNTERS);
+ cpu_hw_evt->snapshot_set_done = true;
+
+ return 0;
+}
+
static u64 pmu_sbi_ctr_read(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
struct sbiret ret;
- union sbi_pmu_ctr_info info;
u64 val = 0;
+ struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
+ struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+ struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
+ union sbi_pmu_ctr_info info = pmu_ctr_list[idx];
+
+ /* Read the value from the shared memory directly only if counter is stopped */
+ if (sbi_pmu_snapshot_available() && (hwc->state & PERF_HES_STOPPED)) {
+ val = sdata->ctr_values[idx];
+ return val;
+ }
if (pmu_sbi_is_fw_event(event)) {
ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ,
hwc->idx, 0, 0, 0, 0, 0);
- if (!ret.error)
- val = ret.value;
+ if (ret.error)
+ return 0;
+
+ val = ret.value;
+ if (IS_ENABLED(CONFIG_32BIT) && sbi_v2_available && info.width >= 32) {
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ_HI,
+ hwc->idx, 0, 0, 0, 0, 0);
+ if (!ret.error)
+ val |= ((u64)ret.value << 32);
+ else
+ WARN_ONCE(1, "Unable to read upper 32 bits of firmware counter error: %ld\n",
+ ret.error);
+ }
} else {
- info = pmu_ctr_list[idx];
val = riscv_pmu_ctr_read_csr(info.csr);
if (IS_ENABLED(CONFIG_32BIT))
- val = ((u64)riscv_pmu_ctr_read_csr(info.csr + 0x80)) << 31 | val;
+ val |= ((u64)riscv_pmu_ctr_read_csr(info.csr + 0x80)) << 32;
}
return val;
@@ -553,6 +660,7 @@ static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
struct hw_perf_event *hwc = &event->hw;
unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
+ /* There is no benefit setting SNAPSHOT FLAG for a single counter */
#if defined(CONFIG_32BIT)
ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, hwc->idx,
1, flag, ival, ival >> 32, 0);
@@ -573,16 +681,36 @@ static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag)
{
struct sbiret ret;
struct hw_perf_event *hwc = &event->hw;
+ struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
+ struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+ struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) &&
(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
pmu_sbi_reset_scounteren((void *)event);
+ if (sbi_pmu_snapshot_available())
+ flag |= SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
+
ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0);
- if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
- flag != SBI_PMU_STOP_FLAG_RESET)
+ if (!ret.error && sbi_pmu_snapshot_available()) {
+ /*
+ * The counter snapshot is based on the index base specified by hwc->idx.
+ * The actual counter value is updated in shared memory at index 0 when counter
+ * mask is 0x01. To ensure accurate counter values, it's necessary to transfer
+ * the counter value to shared memory. However, if hwc->idx is zero, the counter
+ * value is already correctly updated in shared memory, requiring no further
+ * adjustment.
+ */
+ if (hwc->idx > 0) {
+ sdata->ctr_values[hwc->idx] = sdata->ctr_values[0];
+ sdata->ctr_values[0] = 0;
+ }
+ } else if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
+ flag != SBI_PMU_STOP_FLAG_RESET) {
pr_err("Stopping counter idx %d failed with error %d\n",
hwc->idx, sbi_err_map_linux_errno(ret.error));
+ }
}
static int pmu_sbi_find_num_ctrs(void)
@@ -640,10 +768,39 @@ static inline void pmu_sbi_stop_all(struct riscv_pmu *pmu)
static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
{
struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+ struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
+ unsigned long flag = 0;
+ int i, idx;
+ struct sbiret ret;
+ u64 temp_ctr_overflow_mask = 0;
+
+ if (sbi_pmu_snapshot_available())
+ flag = SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
+
+ /* Reset the shadow copy to avoid save/restore any value from previous overflow */
+ memset(cpu_hw_evt->snapshot_cval_shcopy, 0, sizeof(u64) * RISCV_MAX_COUNTERS);
+
+ for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) {
+ /* No need to check the error here as we can't do anything about the error */
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, i * BITS_PER_LONG,
+ cpu_hw_evt->used_hw_ctrs[i], flag, 0, 0, 0);
+ if (!ret.error && sbi_pmu_snapshot_available()) {
+ /* Save the counter values to avoid clobbering */
+ for_each_set_bit(idx, &cpu_hw_evt->used_hw_ctrs[i], BITS_PER_LONG)
+ cpu_hw_evt->snapshot_cval_shcopy[i * BITS_PER_LONG + idx] =
+ sdata->ctr_values[idx];
+ /* Save the overflow mask to avoid clobbering */
+ temp_ctr_overflow_mask |= sdata->ctr_overflow_mask << (i * BITS_PER_LONG);
+ }
+ }
- /* No need to check the error here as we can't do anything about the error */
- sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, 0,
- cpu_hw_evt->used_hw_ctrs[0], 0, 0, 0, 0);
+ /* Restore the counter values to the shared memory for used hw counters */
+ if (sbi_pmu_snapshot_available()) {
+ for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS)
+ sdata->ctr_values[idx] = cpu_hw_evt->snapshot_cval_shcopy[idx];
+ if (temp_ctr_overflow_mask)
+ sdata->ctr_overflow_mask = temp_ctr_overflow_mask;
+ }
}
/*
@@ -652,11 +809,10 @@ static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
* while the overflowed counters need to be started with updated initialization
* value.
*/
-static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
- unsigned long ctr_ovf_mask)
+static inline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt,
+ u64 ctr_ovf_mask)
{
- int idx = 0;
- struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+ int idx = 0, i;
struct perf_event *event;
unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
unsigned long ctr_start_mask = 0;
@@ -664,11 +820,12 @@ static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
struct hw_perf_event *hwc;
u64 init_val = 0;
- ctr_start_mask = cpu_hw_evt->used_hw_ctrs[0] & ~ctr_ovf_mask;
-
- /* Start all the counters that did not overflow in a single shot */
- sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, 0, ctr_start_mask,
- 0, 0, 0, 0);
+ for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) {
+ ctr_start_mask = cpu_hw_evt->used_hw_ctrs[i] & ~ctr_ovf_mask;
+ /* Start all the counters that did not overflow in a single shot */
+ sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, i * BITS_PER_LONG, ctr_start_mask,
+ 0, 0, 0, 0);
+ }
/* Reinitialize and start all the counter that overflowed */
while (ctr_ovf_mask) {
@@ -691,6 +848,52 @@ static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
}
}
+static inline void pmu_sbi_start_ovf_ctrs_snapshot(struct cpu_hw_events *cpu_hw_evt,
+ u64 ctr_ovf_mask)
+{
+ int i, idx = 0;
+ struct perf_event *event;
+ unsigned long flag = SBI_PMU_START_FLAG_INIT_SNAPSHOT;
+ u64 max_period, init_val = 0;
+ struct hw_perf_event *hwc;
+ struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
+
+ for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS) {
+ if (ctr_ovf_mask & BIT(idx)) {
+ event = cpu_hw_evt->events[idx];
+ hwc = &event->hw;
+ max_period = riscv_pmu_ctr_get_width_mask(event);
+ init_val = local64_read(&hwc->prev_count) & max_period;
+ cpu_hw_evt->snapshot_cval_shcopy[idx] = init_val;
+ }
+ /*
+ * We do not need to update the non-overflow counters the previous
+ * value should have been there already.
+ */
+ }
+
+ for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) {
+ /* Restore the counter values to relative indices for used hw counters */
+ for_each_set_bit(idx, &cpu_hw_evt->used_hw_ctrs[i], BITS_PER_LONG)
+ sdata->ctr_values[idx] =
+ cpu_hw_evt->snapshot_cval_shcopy[idx + i * BITS_PER_LONG];
+ /* Start all the counters in a single shot */
+ sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, idx * BITS_PER_LONG,
+ cpu_hw_evt->used_hw_ctrs[i], flag, 0, 0, 0);
+ }
+}
+
+static void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
+ u64 ctr_ovf_mask)
+{
+ struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+
+ if (sbi_pmu_snapshot_available())
+ pmu_sbi_start_ovf_ctrs_snapshot(cpu_hw_evt, ctr_ovf_mask);
+ else
+ pmu_sbi_start_ovf_ctrs_sbi(cpu_hw_evt, ctr_ovf_mask);
+}
+
static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
{
struct perf_sample_data data;
@@ -700,10 +903,11 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
int lidx, hidx, fidx;
struct riscv_pmu *pmu;
struct perf_event *event;
- unsigned long overflow;
- unsigned long overflowed_ctrs = 0;
+ u64 overflow;
+ u64 overflowed_ctrs = 0;
struct cpu_hw_events *cpu_hw_evt = dev;
u64 start_clock = sched_clock();
+ struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
if (WARN_ON_ONCE(!cpu_hw_evt))
return IRQ_NONE;
@@ -725,7 +929,10 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
pmu_sbi_stop_hw_ctrs(pmu);
/* Overflow status register should only be read after counter are stopped */
- ALT_SBI_PMU_OVERFLOW(overflow);
+ if (sbi_pmu_snapshot_available())
+ overflow = sdata->ctr_overflow_mask;
+ else
+ ALT_SBI_PMU_OVERFLOW(overflow);
/*
* Overflow interrupt pending bit should only be cleared after stopping
@@ -751,9 +958,14 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
if (!info || info->type != SBI_PMU_CTR_TYPE_HW)
continue;
- /* compute hardware counter index */
- hidx = info->csr - CSR_CYCLE;
- /* check if the corresponding bit is set in sscountovf */
+ if (sbi_pmu_snapshot_available())
+ /* SBI implementation already updated the logical indicies */
+ hidx = lidx;
+ else
+ /* compute hardware counter index */
+ hidx = info->csr - CSR_CYCLE;
+
+ /* check if the corresponding bit is set in sscountovf or overflow mask in shmem */
if (!(overflow & BIT(hidx)))
continue;
@@ -763,7 +975,10 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
*/
overflowed_ctrs |= BIT(lidx);
hw_evt = &event->hw;
+ /* Update the event states here so that we know the state while reading */
+ hw_evt->state |= PERF_HES_STOPPED;
riscv_pmu_event_update(event);
+ hw_evt->state |= PERF_HES_UPTODATE;
perf_sample_data_init(&data, 0, hw_evt->last_period);
if (riscv_pmu_event_set_period(event)) {
/*
@@ -776,6 +991,8 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
*/
perf_event_overflow(event, &data, regs);
}
+ /* Reset the state as we are going to start the counter after the loop */
+ hw_evt->state = 0;
}
pmu_sbi_start_overflow_mask(pmu, overflowed_ctrs);
@@ -807,6 +1024,9 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE);
}
+ if (sbi_pmu_snapshot_available())
+ return pmu_sbi_snapshot_setup(pmu, cpu);
+
return 0;
}
@@ -819,6 +1039,9 @@ static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
/* Disable all counters access for user mode now */
csr_write(CSR_SCOUNTEREN, 0x0);
+ if (sbi_pmu_snapshot_available())
+ return pmu_sbi_snapshot_disable();
+
return 0;
}
@@ -927,6 +1150,12 @@ static inline void riscv_pm_pmu_unregister(struct riscv_pmu *pmu) { }
static void riscv_pmu_destroy(struct riscv_pmu *pmu)
{
+ if (sbi_v2_available) {
+ if (sbi_pmu_snapshot_available()) {
+ pmu_sbi_snapshot_disable();
+ pmu_sbi_snapshot_free(pmu);
+ }
+ }
riscv_pm_pmu_unregister(pmu);
cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
}
@@ -1094,10 +1323,6 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
pmu->event_unmapped = pmu_sbi_event_unmapped;
pmu->csr_index = pmu_sbi_csr_index;
- ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
- if (ret)
- return ret;
-
ret = riscv_pm_pmu_register(pmu);
if (ret)
goto out_unregister;
@@ -1106,8 +1331,34 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
if (ret)
goto out_unregister;
+ /* SBI PMU Snapsphot is only available in SBI v2.0 */
+ if (sbi_v2_available) {
+ ret = pmu_sbi_snapshot_alloc(pmu);
+ if (ret)
+ goto out_unregister;
+
+ ret = pmu_sbi_snapshot_setup(pmu, smp_processor_id());
+ if (ret) {
+ /* Snapshot is an optional feature. Continue if not available */
+ pmu_sbi_snapshot_free(pmu);
+ } else {
+ pr_info("SBI PMU snapshot detected\n");
+ /*
+ * We enable it once here for the boot cpu. If snapshot shmem setup
+ * fails during cpu hotplug process, it will fail to start the cpu
+ * as we can not handle hetergenous PMUs with different snapshot
+ * capability.
+ */
+ static_branch_enable(&sbi_pmu_snapshot_available);
+ }
+ }
+
register_sysctl("kernel", sbi_pmu_sysctl_table);
+ ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
+ if (ret)
+ goto out_unregister;
+
return 0;
out_unregister:
@@ -1135,6 +1386,9 @@ static int __init pmu_sbi_devinit(void)
return 0;
}
+ if (sbi_spec_version >= sbi_mk_version(2, 0))
+ sbi_v2_available = true;
+
ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_RISCV_STARTING,
"perf/riscv/pmu:starting",
pmu_sbi_starting_cpu, pmu_sbi_dying_cpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 48f31dcd318a..afbc99264ffa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -259,7 +259,6 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
union kvm_mmu_notifier_arg {
- pte_t pte;
unsigned long attributes;
};
@@ -273,7 +272,6 @@ struct kvm_gfn_range {
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
#endif
enum {
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index d93f6522b2c3..827ecc0b7e10 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -86,6 +86,7 @@ struct gfn_to_pfn_cache {
struct kvm_mmu_memory_cache {
gfp_t gfp_zero;
gfp_t gfp_custom;
+ u64 init_value;
struct kmem_cache *kmem_cache;
int capacity;
int nobjs;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index f349e08a9dfe..d39ebb10caeb 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -123,15 +123,6 @@ struct mmu_notifier_ops {
unsigned long address);
/*
- * change_pte is called in cases that pte mapping to page is changed:
- * for example, when ksm remaps pte to point to a new shared page.
- */
- void (*change_pte)(struct mmu_notifier *subscription,
- struct mm_struct *mm,
- unsigned long address,
- pte_t pte);
-
- /*
* invalidate_range_start() and invalidate_range_end() must be
* paired and are called only when the mmap_lock and/or the
* locks protecting the reverse maps are held. If the subsystem
@@ -392,8 +383,6 @@ extern int __mmu_notifier_clear_young(struct mm_struct *mm,
unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
-extern void __mmu_notifier_change_pte(struct mm_struct *mm,
- unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
@@ -439,13 +428,6 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
return 0;
}
-static inline void mmu_notifier_change_pte(struct mm_struct *mm,
- unsigned long address, pte_t pte)
-{
- if (mm_has_notifiers(mm))
- __mmu_notifier_change_pte(mm, address, pte);
-}
-
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
@@ -581,26 +563,6 @@ static inline void mmu_notifier_range_init_owner(
__young; \
})
-/*
- * set_pte_at_notify() sets the pte _after_ running the notifier.
- * This is safe to start by updating the secondary MMUs, because the primary MMU
- * pte invalidate must have already happened with a ptep_clear_flush() before
- * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is
- * required when we change both the protection of the mapping from read-only to
- * read-write and the pfn (like during copy on write page faults). Otherwise the
- * old page would remain mapped readonly in the secondary MMUs after the new
- * page is already writable by some CPU through the primary MMU.
- */
-#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
-({ \
- struct mm_struct *___mm = __mm; \
- unsigned long ___address = __address; \
- pte_t ___pte = __pte; \
- \
- mmu_notifier_change_pte(___mm, ___address, ___pte); \
- set_pte_at(___mm, ___address, __ptep, ___pte); \
-})
-
#else /* CONFIG_MMU_NOTIFIER */
struct mmu_notifier_range {
@@ -650,11 +612,6 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
return 0;
}
-static inline void mmu_notifier_change_pte(struct mm_struct *mm,
- unsigned long address, pte_t pte)
-{
-}
-
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
@@ -693,7 +650,6 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
-#define set_pte_at_notify set_pte_at
static inline void mmu_notifier_synchronize(void)
{
diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
index 43282e22ebe1..701974639ff2 100644
--- a/include/linux/perf/riscv_pmu.h
+++ b/include/linux/perf/riscv_pmu.h
@@ -39,6 +39,14 @@ struct cpu_hw_events {
DECLARE_BITMAP(used_hw_ctrs, RISCV_MAX_COUNTERS);
/* currently enabled firmware counters */
DECLARE_BITMAP(used_fw_ctrs, RISCV_MAX_COUNTERS);
+ /* The virtual address of the shared memory where counter snapshot will be taken */
+ void *snapshot_addr;
+ /* The physical address of the shared memory where counter snapshot will be taken */
+ phys_addr_t snapshot_addr_phys;
+ /* Boolean flag to indicate setup is already done */
+ bool snapshot_set_done;
+ /* A shadow copy of the counter values to avoid clobbering during multiple SBI calls */
+ u64 snapshot_cval_shcopy[RISCV_MAX_COUNTERS];
};
struct riscv_pmu {
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 011fba6b5552..74e40d5d4af4 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -456,21 +456,6 @@ TRACE_EVENT(kvm_unmap_hva_range,
__entry->start, __entry->end)
);
-TRACE_EVENT(kvm_set_spte_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
-);
-
TRACE_EVENT(kvm_age_hva,
TP_PROTO(unsigned long start, unsigned long end),
TP_ARGS(start, end),
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index e4834d23e1d1..1215bc299390 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -18,7 +18,7 @@
#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
-#include <linux/mmu_notifier.h> /* set_pte_at_notify */
+#include <linux/mmu_notifier.h>
#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
@@ -195,8 +195,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
ptep_clear_flush(vma, addr, pvmw.pte);
if (new_page)
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
+ set_pte_at(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
folio_remove_rmap_pte(old_folio, old_page, vma);
if (!folio_mapped(old_folio))
diff --git a/mm/ksm.c b/mm/ksm.c
index 8c001819cf10..108a4d167824 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1345,7 +1345,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (pte_write(entry))
entry = pte_wrprotect(entry);
- set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
}
*orig_pte = entry;
err = 0;
@@ -1447,7 +1447,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
* See Documentation/mm/mmu_notifier.rst
*/
ptep_clear_flush(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, newpte);
+ set_pte_at(mm, addr, ptep, newpte);
folio = page_folio(page);
folio_remove_rmap_pte(folio, page, vma);
diff --git a/mm/memory.c b/mm/memory.c
index d2155ced45f8..0201f50d8307 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3329,13 +3329,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
ptep_clear_flush(vma, vmf->address, vmf->pte);
folio_add_new_anon_rmap(new_folio, vma, vmf->address);
folio_add_lru_vma(new_folio, vma);
- /*
- * We call the notify macro here because, when using secondary
- * mmu page tables (such as kvm shadow page tables), we want the
- * new page to be mapped directly into the secondary page table.
- */
BUG_ON(unshare && pte_write(entry));
- set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+ set_pte_at(mm, vmf->address, vmf->pte, entry);
update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
if (old_folio) {
/*
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index b6c27c76e1a0..66206734b1b9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -664,13 +664,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (flush) {
flush_cache_page(vma, addr, pte_pfn(orig_pte));
ptep_clear_flush(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, entry);
- update_mmu_cache(vma, addr, ptep);
- } else {
- /* No need to invalidate - it was non-present before */
- set_pte_at(mm, addr, ptep, entry);
- update_mmu_cache(vma, addr, ptep);
}
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
pte_unmap_unlock(ptep, ptl);
*src = MIGRATE_PFN_MIGRATE;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index ec3b068cbbe6..8982e6139d07 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -424,23 +424,6 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
return young;
}
-void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
- pte_t pte)
-{
- struct mmu_notifier *subscription;
- int id;
-
- id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist,
- srcu_read_lock_held(&srcu)) {
- if (subscription->ops->change_pte)
- subscription->ops->change_pte(subscription, mm, address,
- pte);
- }
- srcu_read_unlock(&srcu, id);
-}
-
static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
const struct mmu_notifier_range *range)
{
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 741c7dc16afc..266c0fe85fef 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -120,6 +120,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test
TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
+TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests
TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test
TEST_GEN_PROGS_x86_64 += x86_64/amx_test
@@ -189,6 +190,8 @@ TEST_GEN_PROGS_s390x += rseq_test
TEST_GEN_PROGS_s390x += set_memory_region_test
TEST_GEN_PROGS_s390x += kvm_binary_stats_test
+TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test
+TEST_GEN_PROGS_riscv += riscv/ebreak_test
TEST_GEN_PROGS_riscv += arch_timer
TEST_GEN_PROGS_riscv += demand_paging_test
TEST_GEN_PROGS_riscv += dirty_log_test
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index 3e0db283a46a..8acca8237687 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -93,7 +93,6 @@ enum kvm_mem_region_type {
struct kvm_vm {
int mode;
unsigned long type;
- uint8_t subtype;
int kvm_fd;
int fd;
unsigned int pgtable_levels;
@@ -200,8 +199,8 @@ enum vm_guest_mode {
struct vm_shape {
uint32_t type;
uint8_t mode;
- uint8_t subtype;
- uint16_t padding;
+ uint8_t pad0;
+ uint16_t pad1;
};
kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t));
@@ -890,17 +889,15 @@ static inline struct kvm_vm *vm_create_barebones(void)
return ____vm_create(VM_SHAPE_DEFAULT);
}
-#ifdef __x86_64__
-static inline struct kvm_vm *vm_create_barebones_protected_vm(void)
+static inline struct kvm_vm *vm_create_barebones_type(unsigned long type)
{
const struct vm_shape shape = {
.mode = VM_MODE_DEFAULT,
- .type = KVM_X86_SW_PROTECTED_VM,
+ .type = type,
};
return ____vm_create(shape);
}
-#endif
static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus)
{
diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h
index ce473fe251dd..5f389166338c 100644
--- a/tools/testing/selftests/kvm/include/riscv/processor.h
+++ b/tools/testing/selftests/kvm/include/riscv/processor.h
@@ -50,6 +50,16 @@ static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype,
bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext);
+static inline bool __vcpu_has_isa_ext(struct kvm_vcpu *vcpu, uint64_t isa_ext)
+{
+ return __vcpu_has_ext(vcpu, RISCV_ISA_EXT_REG(isa_ext));
+}
+
+static inline bool __vcpu_has_sbi_ext(struct kvm_vcpu *vcpu, uint64_t sbi_ext)
+{
+ return __vcpu_has_ext(vcpu, RISCV_SBI_EXT_REG(sbi_ext));
+}
+
struct ex_regs {
unsigned long ra;
unsigned long sp;
@@ -154,45 +164,6 @@ void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handle
#define PGTBL_PAGE_SIZE PGTBL_L0_BLOCK_SIZE
#define PGTBL_PAGE_SIZE_SHIFT PGTBL_L0_BLOCK_SHIFT
-/* SBI return error codes */
-#define SBI_SUCCESS 0
-#define SBI_ERR_FAILURE -1
-#define SBI_ERR_NOT_SUPPORTED -2
-#define SBI_ERR_INVALID_PARAM -3
-#define SBI_ERR_DENIED -4
-#define SBI_ERR_INVALID_ADDRESS -5
-#define SBI_ERR_ALREADY_AVAILABLE -6
-#define SBI_ERR_ALREADY_STARTED -7
-#define SBI_ERR_ALREADY_STOPPED -8
-
-#define SBI_EXT_EXPERIMENTAL_START 0x08000000
-#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF
-
-#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END
-#define KVM_RISCV_SELFTESTS_SBI_UCALL 0
-#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1
-
-enum sbi_ext_id {
- SBI_EXT_BASE = 0x10,
- SBI_EXT_STA = 0x535441,
-};
-
-enum sbi_ext_base_fid {
- SBI_EXT_BASE_PROBE_EXT = 3,
-};
-
-struct sbiret {
- long error;
- long value;
-};
-
-struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
- unsigned long arg1, unsigned long arg2,
- unsigned long arg3, unsigned long arg4,
- unsigned long arg5);
-
-bool guest_sbi_probe_extension(int extid, long *out_val);
-
static inline void local_irq_enable(void)
{
csr_set(CSR_SSTATUS, SR_SIE);
diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h
new file mode 100644
index 000000000000..046b432ae896
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/riscv/sbi.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RISC-V SBI specific definitions
+ *
+ * Copyright (C) 2024 Rivos Inc.
+ */
+
+#ifndef SELFTEST_KVM_SBI_H
+#define SELFTEST_KVM_SBI_H
+
+/* SBI spec version fields */
+#define SBI_SPEC_VERSION_DEFAULT 0x1
+#define SBI_SPEC_VERSION_MAJOR_SHIFT 24
+#define SBI_SPEC_VERSION_MAJOR_MASK 0x7f
+#define SBI_SPEC_VERSION_MINOR_MASK 0xffffff
+
+/* SBI return error codes */
+#define SBI_SUCCESS 0
+#define SBI_ERR_FAILURE -1
+#define SBI_ERR_NOT_SUPPORTED -2
+#define SBI_ERR_INVALID_PARAM -3
+#define SBI_ERR_DENIED -4
+#define SBI_ERR_INVALID_ADDRESS -5
+#define SBI_ERR_ALREADY_AVAILABLE -6
+#define SBI_ERR_ALREADY_STARTED -7
+#define SBI_ERR_ALREADY_STOPPED -8
+
+#define SBI_EXT_EXPERIMENTAL_START 0x08000000
+#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF
+
+#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END
+#define KVM_RISCV_SELFTESTS_SBI_UCALL 0
+#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1
+
+enum sbi_ext_id {
+ SBI_EXT_BASE = 0x10,
+ SBI_EXT_STA = 0x535441,
+ SBI_EXT_PMU = 0x504D55,
+};
+
+enum sbi_ext_base_fid {
+ SBI_EXT_BASE_GET_SPEC_VERSION = 0,
+ SBI_EXT_BASE_GET_IMP_ID,
+ SBI_EXT_BASE_GET_IMP_VERSION,
+ SBI_EXT_BASE_PROBE_EXT = 3,
+};
+enum sbi_ext_pmu_fid {
+ SBI_EXT_PMU_NUM_COUNTERS = 0,
+ SBI_EXT_PMU_COUNTER_GET_INFO,
+ SBI_EXT_PMU_COUNTER_CFG_MATCH,
+ SBI_EXT_PMU_COUNTER_START,
+ SBI_EXT_PMU_COUNTER_STOP,
+ SBI_EXT_PMU_COUNTER_FW_READ,
+ SBI_EXT_PMU_COUNTER_FW_READ_HI,
+ SBI_EXT_PMU_SNAPSHOT_SET_SHMEM,
+};
+
+union sbi_pmu_ctr_info {
+ unsigned long value;
+ struct {
+ unsigned long csr:12;
+ unsigned long width:6;
+#if __riscv_xlen == 32
+ unsigned long reserved:13;
+#else
+ unsigned long reserved:45;
+#endif
+ unsigned long type:1;
+ };
+};
+
+struct riscv_pmu_snapshot_data {
+ u64 ctr_overflow_mask;
+ u64 ctr_values[64];
+ u64 reserved[447];
+};
+
+struct sbiret {
+ long error;
+ long value;
+};
+
+/** General pmu event codes specified in SBI PMU extension */
+enum sbi_pmu_hw_generic_events_t {
+ SBI_PMU_HW_NO_EVENT = 0,
+ SBI_PMU_HW_CPU_CYCLES = 1,
+ SBI_PMU_HW_INSTRUCTIONS = 2,
+ SBI_PMU_HW_CACHE_REFERENCES = 3,
+ SBI_PMU_HW_CACHE_MISSES = 4,
+ SBI_PMU_HW_BRANCH_INSTRUCTIONS = 5,
+ SBI_PMU_HW_BRANCH_MISSES = 6,
+ SBI_PMU_HW_BUS_CYCLES = 7,
+ SBI_PMU_HW_STALLED_CYCLES_FRONTEND = 8,
+ SBI_PMU_HW_STALLED_CYCLES_BACKEND = 9,
+ SBI_PMU_HW_REF_CPU_CYCLES = 10,
+
+ SBI_PMU_HW_GENERAL_MAX,
+};
+
+/* SBI PMU counter types */
+enum sbi_pmu_ctr_type {
+ SBI_PMU_CTR_TYPE_HW = 0x0,
+ SBI_PMU_CTR_TYPE_FW,
+};
+
+/* Flags defined for config matching function */
+#define SBI_PMU_CFG_FLAG_SKIP_MATCH BIT(0)
+#define SBI_PMU_CFG_FLAG_CLEAR_VALUE BIT(1)
+#define SBI_PMU_CFG_FLAG_AUTO_START BIT(2)
+#define SBI_PMU_CFG_FLAG_SET_VUINH BIT(3)
+#define SBI_PMU_CFG_FLAG_SET_VSINH BIT(4)
+#define SBI_PMU_CFG_FLAG_SET_UINH BIT(5)
+#define SBI_PMU_CFG_FLAG_SET_SINH BIT(6)
+#define SBI_PMU_CFG_FLAG_SET_MINH BIT(7)
+
+/* Flags defined for counter start function */
+#define SBI_PMU_START_FLAG_SET_INIT_VALUE BIT(0)
+#define SBI_PMU_START_FLAG_INIT_SNAPSHOT BIT(1)
+
+/* Flags defined for counter stop function */
+#define SBI_PMU_STOP_FLAG_RESET BIT(0)
+#define SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT BIT(1)
+
+struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
+ unsigned long arg1, unsigned long arg2,
+ unsigned long arg3, unsigned long arg4,
+ unsigned long arg5);
+
+bool guest_sbi_probe_extension(int extid, long *out_val);
+
+/* Make SBI version */
+static inline unsigned long sbi_mk_version(unsigned long major,
+ unsigned long minor)
+{
+ return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << SBI_SPEC_VERSION_MAJOR_SHIFT)
+ | (minor & SBI_SPEC_VERSION_MINOR_MASK);
+}
+
+unsigned long get_host_sbi_spec_version(void);
+
+#endif /* SELFTEST_KVM_SBI_H */
diff --git a/tools/testing/selftests/kvm/include/riscv/ucall.h b/tools/testing/selftests/kvm/include/riscv/ucall.h
index be46eb32ec27..a695ae36f3e0 100644
--- a/tools/testing/selftests/kvm/include/riscv/ucall.h
+++ b/tools/testing/selftests/kvm/include/riscv/ucall.h
@@ -3,6 +3,7 @@
#define SELFTEST_KVM_UCALL_H
#include "processor.h"
+#include "sbi.h"
#define UCALL_EXIT_REASON KVM_EXIT_RISCV_SBI
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 81ce37ec407d..74a59c7ce7ed 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -23,12 +23,6 @@
extern bool host_cpu_is_intel;
extern bool host_cpu_is_amd;
-enum vm_guest_x86_subtype {
- VM_SUBTYPE_NONE = 0,
- VM_SUBTYPE_SEV,
- VM_SUBTYPE_SEV_ES,
-};
-
/* Forced emulation prefix, used to invoke the emulator unconditionally. */
#define KVM_FEP "ud2; .byte 'k', 'v', 'm';"
diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h
index 8a1bf88474c9..82c11c81a956 100644
--- a/tools/testing/selftests/kvm/include/x86_64/sev.h
+++ b/tools/testing/selftests/kvm/include/x86_64/sev.h
@@ -31,8 +31,9 @@ void sev_vm_launch(struct kvm_vm *vm, uint32_t policy);
void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement);
void sev_vm_launch_finish(struct kvm_vm *vm);
-struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code,
+struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code,
struct kvm_vcpu **cpu);
+void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement);
kvm_static_assert(SEV_RET_SUCCESS == 0);
@@ -67,20 +68,8 @@ kvm_static_assert(SEV_RET_SUCCESS == 0);
__TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \
})
-static inline void sev_vm_init(struct kvm_vm *vm)
-{
- vm->arch.sev_fd = open_sev_dev_path_or_exit();
-
- vm_sev_ioctl(vm, KVM_SEV_INIT, NULL);
-}
-
-
-static inline void sev_es_vm_init(struct kvm_vm *vm)
-{
- vm->arch.sev_fd = open_sev_dev_path_or_exit();
-
- vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL);
-}
+void sev_vm_init(struct kvm_vm *vm);
+void sev_es_vm_init(struct kvm_vm *vm);
static inline void sev_register_encrypted_memory(struct kvm_vm *vm,
struct userspace_mem_region *region)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index b2262b5fad9e..9da388100f3a 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -276,7 +276,6 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
vm->mode = shape.mode;
vm->type = shape.type;
- vm->subtype = shape.subtype;
vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits;
vm->va_bits = vm_guest_mode_params[vm->mode].va_bits;
diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c
index e8211f5d6863..ccb35573749c 100644
--- a/tools/testing/selftests/kvm/lib/riscv/processor.c
+++ b/tools/testing/selftests/kvm/lib/riscv/processor.c
@@ -502,3 +502,15 @@ bool guest_sbi_probe_extension(int extid, long *out_val)
return true;
}
+
+unsigned long get_host_sbi_spec_version(void)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_GET_SPEC_VERSION, 0,
+ 0, 0, 0, 0, 0);
+
+ GUEST_ASSERT(!ret.error);
+
+ return ret.value;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 74a4c736c9ae..9f87ca8b7ab6 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -578,10 +578,11 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm)
sync_global_to_guest(vm, host_cpu_is_intel);
sync_global_to_guest(vm, host_cpu_is_amd);
- if (vm->subtype == VM_SUBTYPE_SEV)
- sev_vm_init(vm);
- else if (vm->subtype == VM_SUBTYPE_SEV_ES)
- sev_es_vm_init(vm);
+ if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
+ struct kvm_sev_init init = { 0 };
+
+ vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
+ }
}
void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
@@ -1081,9 +1082,12 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
void kvm_init_vm_address_properties(struct kvm_vm *vm)
{
- if (vm->subtype == VM_SUBTYPE_SEV || vm->subtype == VM_SUBTYPE_SEV_ES) {
+ if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
+ vm->arch.sev_fd = open_sev_dev_path_or_exit();
vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
vm->gpa_tag_mask = vm->arch.c_bit;
+ } else {
+ vm->arch.sev_fd = -1;
}
}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c
index e248d3364b9c..d482029b6004 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/sev.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c
@@ -35,6 +35,32 @@ static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *regio
}
}
+void sev_vm_init(struct kvm_vm *vm)
+{
+ if (vm->type == KVM_X86_DEFAULT_VM) {
+ assert(vm->arch.sev_fd == -1);
+ vm->arch.sev_fd = open_sev_dev_path_or_exit();
+ vm_sev_ioctl(vm, KVM_SEV_INIT, NULL);
+ } else {
+ struct kvm_sev_init init = { 0 };
+ assert(vm->type == KVM_X86_SEV_VM);
+ vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
+ }
+}
+
+void sev_es_vm_init(struct kvm_vm *vm)
+{
+ if (vm->type == KVM_X86_DEFAULT_VM) {
+ assert(vm->arch.sev_fd == -1);
+ vm->arch.sev_fd = open_sev_dev_path_or_exit();
+ vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL);
+ } else {
+ struct kvm_sev_init init = { 0 };
+ assert(vm->type == KVM_X86_SEV_ES_VM);
+ vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
+ }
+}
+
void sev_vm_launch(struct kvm_vm *vm, uint32_t policy)
{
struct kvm_sev_launch_start launch_start = {
@@ -87,28 +113,30 @@ void sev_vm_launch_finish(struct kvm_vm *vm)
TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING);
}
-struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code,
+struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code,
struct kvm_vcpu **cpu)
{
struct vm_shape shape = {
- .type = VM_TYPE_DEFAULT,
.mode = VM_MODE_DEFAULT,
- .subtype = policy & SEV_POLICY_ES ? VM_SUBTYPE_SEV_ES :
- VM_SUBTYPE_SEV,
+ .type = type,
};
struct kvm_vm *vm;
struct kvm_vcpu *cpus[1];
- uint8_t measurement[512];
vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus);
*cpu = cpus[0];
+ return vm;
+}
+
+void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement)
+{
sev_vm_launch(vm, policy);
- /* TODO: Validate the measurement is as expected. */
+ if (!measurement)
+ measurement = alloca(256);
+
sev_vm_launch_measure(vm, measurement);
sev_vm_launch_finish(vm);
-
- return vm;
}
diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c
index 0f9cabd99fd4..735b78569021 100644
--- a/tools/testing/selftests/kvm/riscv/arch_timer.c
+++ b/tools/testing/selftests/kvm/riscv/arch_timer.c
@@ -85,7 +85,7 @@ struct kvm_vm *test_vm_create(void)
int nr_vcpus = test_args.nr_vcpus;
vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus);
- __TEST_REQUIRE(__vcpu_has_ext(vcpus[0], RISCV_ISA_EXT_REG(KVM_RISCV_ISA_EXT_SSTC)),
+ __TEST_REQUIRE(__vcpu_has_isa_ext(vcpus[0], KVM_RISCV_ISA_EXT_SSTC),
"SSTC not available, skipping test\n");
vm_init_vector_tables(vm);
diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c
new file mode 100644
index 000000000000..823c132069b4
--- /dev/null
+++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RISC-V KVM ebreak test.
+ *
+ * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd.
+ *
+ */
+#include "kvm_util.h"
+
+#define LABEL_ADDRESS(v) ((uint64_t)&(v))
+
+extern unsigned char sw_bp_1, sw_bp_2;
+static uint64_t sw_bp_addr;
+
+static void guest_code(void)
+{
+ asm volatile(
+ ".option push\n"
+ ".option norvc\n"
+ "sw_bp_1: ebreak\n"
+ "sw_bp_2: ebreak\n"
+ ".option pop\n"
+ );
+ GUEST_ASSERT_EQ(READ_ONCE(sw_bp_addr), LABEL_ADDRESS(sw_bp_2));
+
+ GUEST_DONE();
+}
+
+static void guest_breakpoint_handler(struct ex_regs *regs)
+{
+ WRITE_ONCE(sw_bp_addr, regs->epc);
+ regs->epc += 4;
+}
+
+int main(void)
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ uint64_t pc;
+ struct kvm_guest_debug debug = {
+ .control = KVM_GUESTDBG_ENABLE,
+ };
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ vm_init_vector_tables(vm);
+ vcpu_init_vector_tables(vcpu);
+ vm_install_exception_handler(vm, EXC_BREAKPOINT,
+ guest_breakpoint_handler);
+
+ /*
+ * Enable the guest debug.
+ * ebreak should exit to the VMM with KVM_EXIT_DEBUG reason.
+ */
+ vcpu_guest_debug_set(vcpu, &debug);
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG);
+
+ vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &pc);
+ TEST_ASSERT_EQ(pc, LABEL_ADDRESS(sw_bp_1));
+
+ /* skip sw_bp_1 */
+ vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), pc + 4);
+
+ /*
+ * Disable all debug controls.
+ * Guest should handle the ebreak without exiting to the VMM.
+ */
+ memset(&debug, 0, sizeof(debug));
+ vcpu_guest_debug_set(vcpu, &debug);
+
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index b882b7b9b785..222198dd6d04 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -43,6 +43,7 @@ bool filter_reg(__u64 reg)
case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_V:
case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMSTATEEN:
case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA:
+ case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSCOFPMF:
case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSTC:
case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL:
case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT:
@@ -408,6 +409,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off)
KVM_ISA_EXT_ARR(V),
KVM_ISA_EXT_ARR(SMSTATEEN),
KVM_ISA_EXT_ARR(SSAIA),
+ KVM_ISA_EXT_ARR(SSCOFPMF),
KVM_ISA_EXT_ARR(SSTC),
KVM_ISA_EXT_ARR(SVINVAL),
KVM_ISA_EXT_ARR(SVNAPOT),
@@ -931,6 +933,7 @@ KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F);
KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, FP_D);
KVM_ISA_EXT_SIMPLE_CONFIG(h, H);
KVM_ISA_EXT_SUBLIST_CONFIG(smstateen, SMSTATEEN);
+KVM_ISA_EXT_SIMPLE_CONFIG(sscofpmf, SSCOFPMF);
KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC);
KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL);
KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT);
@@ -986,6 +989,7 @@ struct vcpu_reg_list *vcpu_configs[] = {
&config_fp_d,
&config_h,
&config_smstateen,
+ &config_sscofpmf,
&config_sstc,
&config_svinval,
&config_svnapot,
diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c
new file mode 100644
index 000000000000..69bb94e6b227
--- /dev/null
+++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c
@@ -0,0 +1,681 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sbi_pmu_test.c - Tests the riscv64 SBI PMU functionality.
+ *
+ * Copyright (c) 2024, Rivos Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include "kvm_util.h"
+#include "test_util.h"
+#include "processor.h"
+#include "sbi.h"
+#include "arch_timer.h"
+
+/* Maximum counters(firmware + hardware) */
+#define RISCV_MAX_PMU_COUNTERS 64
+union sbi_pmu_ctr_info ctrinfo_arr[RISCV_MAX_PMU_COUNTERS];
+
+/* Snapshot shared memory data */
+#define PMU_SNAPSHOT_GPA_BASE BIT(30)
+static void *snapshot_gva;
+static vm_paddr_t snapshot_gpa;
+
+static int vcpu_shared_irq_count;
+static int counter_in_use;
+
+/* Cache the available counters in a bitmask */
+static unsigned long counter_mask_available;
+
+static bool illegal_handler_invoked;
+
+#define SBI_PMU_TEST_BASIC BIT(0)
+#define SBI_PMU_TEST_EVENTS BIT(1)
+#define SBI_PMU_TEST_SNAPSHOT BIT(2)
+#define SBI_PMU_TEST_OVERFLOW BIT(3)
+
+static int disabled_tests;
+
+unsigned long pmu_csr_read_num(int csr_num)
+{
+#define switchcase_csr_read(__csr_num, __val) {\
+ case __csr_num: \
+ __val = csr_read(__csr_num); \
+ break; }
+#define switchcase_csr_read_2(__csr_num, __val) {\
+ switchcase_csr_read(__csr_num + 0, __val) \
+ switchcase_csr_read(__csr_num + 1, __val)}
+#define switchcase_csr_read_4(__csr_num, __val) {\
+ switchcase_csr_read_2(__csr_num + 0, __val) \
+ switchcase_csr_read_2(__csr_num + 2, __val)}
+#define switchcase_csr_read_8(__csr_num, __val) {\
+ switchcase_csr_read_4(__csr_num + 0, __val) \
+ switchcase_csr_read_4(__csr_num + 4, __val)}
+#define switchcase_csr_read_16(__csr_num, __val) {\
+ switchcase_csr_read_8(__csr_num + 0, __val) \
+ switchcase_csr_read_8(__csr_num + 8, __val)}
+#define switchcase_csr_read_32(__csr_num, __val) {\
+ switchcase_csr_read_16(__csr_num + 0, __val) \
+ switchcase_csr_read_16(__csr_num + 16, __val)}
+
+ unsigned long ret = 0;
+
+ switch (csr_num) {
+ switchcase_csr_read_32(CSR_CYCLE, ret)
+ switchcase_csr_read_32(CSR_CYCLEH, ret)
+ default :
+ break;
+ }
+
+ return ret;
+#undef switchcase_csr_read_32
+#undef switchcase_csr_read_16
+#undef switchcase_csr_read_8
+#undef switchcase_csr_read_4
+#undef switchcase_csr_read_2
+#undef switchcase_csr_read
+}
+
+static inline void dummy_func_loop(uint64_t iter)
+{
+ int i = 0;
+
+ while (i < iter) {
+ asm volatile("nop");
+ i++;
+ }
+}
+
+static void start_counter(unsigned long counter, unsigned long start_flags,
+ unsigned long ival)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, counter, 1, start_flags,
+ ival, 0, 0);
+ __GUEST_ASSERT(ret.error == 0, "Unable to start counter %ld\n", counter);
+}
+
+/* This should be invoked only for reset counter use case */
+static void stop_reset_counter(unsigned long counter, unsigned long stop_flags)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1,
+ stop_flags | SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
+ __GUEST_ASSERT(ret.error == SBI_ERR_ALREADY_STOPPED,
+ "Unable to stop counter %ld\n", counter);
+}
+
+static void stop_counter(unsigned long counter, unsigned long stop_flags)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, stop_flags,
+ 0, 0, 0);
+ __GUEST_ASSERT(ret.error == 0, "Unable to stop counter %ld error %ld\n",
+ counter, ret.error);
+}
+
+static void guest_illegal_exception_handler(struct ex_regs *regs)
+{
+ __GUEST_ASSERT(regs->cause == EXC_INST_ILLEGAL,
+ "Unexpected exception handler %lx\n", regs->cause);
+
+ illegal_handler_invoked = true;
+ /* skip the trapping instruction */
+ regs->epc += 4;
+}
+
+static void guest_irq_handler(struct ex_regs *regs)
+{
+ unsigned int irq_num = regs->cause & ~CAUSE_IRQ_FLAG;
+ struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva;
+ unsigned long overflown_mask;
+ unsigned long counter_val = 0;
+
+ /* Validate that we are in the correct irq handler */
+ GUEST_ASSERT_EQ(irq_num, IRQ_PMU_OVF);
+
+ /* Stop all counters first to avoid further interrupts */
+ stop_counter(counter_in_use, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT);
+
+ csr_clear(CSR_SIP, BIT(IRQ_PMU_OVF));
+
+ overflown_mask = READ_ONCE(snapshot_data->ctr_overflow_mask);
+ GUEST_ASSERT(overflown_mask & 0x01);
+
+ WRITE_ONCE(vcpu_shared_irq_count, vcpu_shared_irq_count+1);
+
+ counter_val = READ_ONCE(snapshot_data->ctr_values[0]);
+ /* Now start the counter to mimick the real driver behavior */
+ start_counter(counter_in_use, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_val);
+}
+
+static unsigned long get_counter_index(unsigned long cbase, unsigned long cmask,
+ unsigned long cflags,
+ unsigned long event)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, cmask,
+ cflags, event, 0, 0);
+ __GUEST_ASSERT(ret.error == 0, "config matching failed %ld\n", ret.error);
+ GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS);
+ GUEST_ASSERT(BIT(ret.value) & counter_mask_available);
+
+ return ret.value;
+}
+
+static unsigned long get_num_counters(void)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_NUM_COUNTERS, 0, 0, 0, 0, 0, 0);
+
+ __GUEST_ASSERT(ret.error == 0, "Unable to retrieve number of counters from SBI PMU");
+ __GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS,
+ "Invalid number of counters %ld\n", ret.value);
+
+ return ret.value;
+}
+
+static void update_counter_info(int num_counters)
+{
+ int i = 0;
+ struct sbiret ret;
+
+ for (i = 0; i < num_counters; i++) {
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, 0, 0, 0, 0, 0);
+
+ /* There can be gaps in logical counter indicies*/
+ if (ret.error)
+ continue;
+ GUEST_ASSERT_NE(ret.value, 0);
+
+ ctrinfo_arr[i].value = ret.value;
+ counter_mask_available |= BIT(i);
+ }
+
+ GUEST_ASSERT(counter_mask_available > 0);
+}
+
+static unsigned long read_fw_counter(int idx, union sbi_pmu_ctr_info ctrinfo)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ, idx, 0, 0, 0, 0, 0);
+ GUEST_ASSERT(ret.error == 0);
+ return ret.value;
+}
+
+static unsigned long read_counter(int idx, union sbi_pmu_ctr_info ctrinfo)
+{
+ unsigned long counter_val = 0;
+
+ __GUEST_ASSERT(ctrinfo.type < 2, "Invalid counter type %d", ctrinfo.type);
+
+ if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW)
+ counter_val = pmu_csr_read_num(ctrinfo.csr);
+ else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW)
+ counter_val = read_fw_counter(idx, ctrinfo);
+
+ return counter_val;
+}
+
+static inline void verify_sbi_requirement_assert(void)
+{
+ long out_val = 0;
+ bool probe;
+
+ probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val);
+ GUEST_ASSERT(probe && out_val == 1);
+
+ if (get_host_sbi_spec_version() < sbi_mk_version(2, 0))
+ __GUEST_ASSERT(0, "SBI implementation version doesn't support PMU Snapshot");
+}
+
+static void snapshot_set_shmem(vm_paddr_t gpa, unsigned long flags)
+{
+ unsigned long lo = (unsigned long)gpa;
+#if __riscv_xlen == 32
+ unsigned long hi = (unsigned long)(gpa >> 32);
+#else
+ unsigned long hi = gpa == -1 ? -1 : 0;
+#endif
+ struct sbiret ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM,
+ lo, hi, flags, 0, 0, 0);
+
+ GUEST_ASSERT(ret.value == 0 && ret.error == 0);
+}
+
+static void test_pmu_event(unsigned long event)
+{
+ unsigned long counter;
+ unsigned long counter_value_pre, counter_value_post;
+ unsigned long counter_init_value = 100;
+
+ counter = get_counter_index(0, counter_mask_available, 0, event);
+ counter_value_pre = read_counter(counter, ctrinfo_arr[counter]);
+
+ /* Do not set the initial value */
+ start_counter(counter, 0, 0);
+ dummy_func_loop(10000);
+ stop_counter(counter, 0);
+
+ counter_value_post = read_counter(counter, ctrinfo_arr[counter]);
+ __GUEST_ASSERT(counter_value_post > counter_value_pre,
+ "Event update verification failed: post [%lx] pre [%lx]\n",
+ counter_value_post, counter_value_pre);
+
+ /*
+ * We can't just update the counter without starting it.
+ * Do start/stop twice to simulate that by first initializing to a very
+ * high value and a low value after that.
+ */
+ start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, ULONG_MAX/2);
+ stop_counter(counter, 0);
+ counter_value_pre = read_counter(counter, ctrinfo_arr[counter]);
+
+ start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value);
+ stop_counter(counter, 0);
+ counter_value_post = read_counter(counter, ctrinfo_arr[counter]);
+ __GUEST_ASSERT(counter_value_pre > counter_value_post,
+ "Counter reinitialization verification failed : post [%lx] pre [%lx]\n",
+ counter_value_post, counter_value_pre);
+
+ /* Now set the initial value and compare */
+ start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value);
+ dummy_func_loop(10000);
+ stop_counter(counter, 0);
+
+ counter_value_post = read_counter(counter, ctrinfo_arr[counter]);
+ __GUEST_ASSERT(counter_value_post > counter_init_value,
+ "Event update verification failed: post [%lx] pre [%lx]\n",
+ counter_value_post, counter_init_value);
+
+ stop_reset_counter(counter, 0);
+}
+
+static void test_pmu_event_snapshot(unsigned long event)
+{
+ unsigned long counter;
+ unsigned long counter_value_pre, counter_value_post;
+ unsigned long counter_init_value = 100;
+ struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva;
+
+ counter = get_counter_index(0, counter_mask_available, 0, event);
+ counter_value_pre = read_counter(counter, ctrinfo_arr[counter]);
+
+ /* Do not set the initial value */
+ start_counter(counter, 0, 0);
+ dummy_func_loop(10000);
+ stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT);
+
+ /* The counter value is updated w.r.t relative index of cbase */
+ counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]);
+ __GUEST_ASSERT(counter_value_post > counter_value_pre,
+ "Event update verification failed: post [%lx] pre [%lx]\n",
+ counter_value_post, counter_value_pre);
+
+ /*
+ * We can't just update the counter without starting it.
+ * Do start/stop twice to simulate that by first initializing to a very
+ * high value and a low value after that.
+ */
+ WRITE_ONCE(snapshot_data->ctr_values[0], ULONG_MAX/2);
+ start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0);
+ stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT);
+ counter_value_pre = READ_ONCE(snapshot_data->ctr_values[0]);
+
+ WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value);
+ start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0);
+ stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT);
+ counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]);
+ __GUEST_ASSERT(counter_value_pre > counter_value_post,
+ "Counter reinitialization verification failed : post [%lx] pre [%lx]\n",
+ counter_value_post, counter_value_pre);
+
+ /* Now set the initial value and compare */
+ WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value);
+ start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0);
+ dummy_func_loop(10000);
+ stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT);
+
+ counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]);
+ __GUEST_ASSERT(counter_value_post > counter_init_value,
+ "Event update verification failed: post [%lx] pre [%lx]\n",
+ counter_value_post, counter_init_value);
+
+ stop_reset_counter(counter, 0);
+}
+
+static void test_pmu_event_overflow(unsigned long event)
+{
+ unsigned long counter;
+ unsigned long counter_value_post;
+ unsigned long counter_init_value = ULONG_MAX - 10000;
+ struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva;
+
+ counter = get_counter_index(0, counter_mask_available, 0, event);
+ counter_in_use = counter;
+
+ /* The counter value is updated w.r.t relative index of cbase passed to start/stop */
+ WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value);
+ start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0);
+ dummy_func_loop(10000);
+ udelay(msecs_to_usecs(2000));
+ /* irq handler should have stopped the counter */
+ stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT);
+
+ counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]);
+ /* The counter value after stopping should be less the init value due to overflow */
+ __GUEST_ASSERT(counter_value_post < counter_init_value,
+ "counter_value_post %lx counter_init_value %lx for counter\n",
+ counter_value_post, counter_init_value);
+
+ stop_reset_counter(counter, 0);
+}
+
+static void test_invalid_event(void)
+{
+ struct sbiret ret;
+ unsigned long event = 0x1234; /* A random event */
+
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, 0,
+ counter_mask_available, 0, event, 0, 0);
+ GUEST_ASSERT_EQ(ret.error, SBI_ERR_NOT_SUPPORTED);
+}
+
+static void test_pmu_events(void)
+{
+ int num_counters = 0;
+
+ /* Get the counter details */
+ num_counters = get_num_counters();
+ update_counter_info(num_counters);
+
+ /* Sanity testing for any random invalid event */
+ test_invalid_event();
+
+ /* Only these two events are guaranteed to be present */
+ test_pmu_event(SBI_PMU_HW_CPU_CYCLES);
+ test_pmu_event(SBI_PMU_HW_INSTRUCTIONS);
+
+ GUEST_DONE();
+}
+
+static void test_pmu_basic_sanity(void)
+{
+ long out_val = 0;
+ bool probe;
+ struct sbiret ret;
+ int num_counters = 0, i;
+ union sbi_pmu_ctr_info ctrinfo;
+
+ probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val);
+ GUEST_ASSERT(probe && out_val == 1);
+
+ num_counters = get_num_counters();
+
+ for (i = 0; i < num_counters; i++) {
+ ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i,
+ 0, 0, 0, 0, 0);
+
+ /* There can be gaps in logical counter indicies*/
+ if (ret.error)
+ continue;
+ GUEST_ASSERT_NE(ret.value, 0);
+
+ ctrinfo.value = ret.value;
+
+ /**
+ * Accessibility check of hardware and read capability of firmware counters.
+ * The spec doesn't mandate any initial value. No need to check any value.
+ */
+ if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) {
+ pmu_csr_read_num(ctrinfo.csr);
+ GUEST_ASSERT(illegal_handler_invoked);
+ } else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) {
+ read_fw_counter(i, ctrinfo);
+ }
+ }
+
+ GUEST_DONE();
+}
+
+static void test_pmu_events_snaphost(void)
+{
+ int num_counters = 0;
+ struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva;
+ int i;
+
+ /* Verify presence of SBI PMU and minimum requrired SBI version */
+ verify_sbi_requirement_assert();
+
+ snapshot_set_shmem(snapshot_gpa, 0);
+
+ /* Get the counter details */
+ num_counters = get_num_counters();
+ update_counter_info(num_counters);
+
+ /* Validate shared memory access */
+ GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_overflow_mask), 0);
+ for (i = 0; i < num_counters; i++) {
+ if (counter_mask_available & (BIT(i)))
+ GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_values[i]), 0);
+ }
+ /* Only these two events are guranteed to be present */
+ test_pmu_event_snapshot(SBI_PMU_HW_CPU_CYCLES);
+ test_pmu_event_snapshot(SBI_PMU_HW_INSTRUCTIONS);
+
+ GUEST_DONE();
+}
+
+static void test_pmu_events_overflow(void)
+{
+ int num_counters = 0;
+
+ /* Verify presence of SBI PMU and minimum requrired SBI version */
+ verify_sbi_requirement_assert();
+
+ snapshot_set_shmem(snapshot_gpa, 0);
+ csr_set(CSR_IE, BIT(IRQ_PMU_OVF));
+ local_irq_enable();
+
+ /* Get the counter details */
+ num_counters = get_num_counters();
+ update_counter_info(num_counters);
+
+ /*
+ * Qemu supports overflow for cycle/instruction.
+ * This test may fail on any platform that do not support overflow for these two events.
+ */
+ test_pmu_event_overflow(SBI_PMU_HW_CPU_CYCLES);
+ GUEST_ASSERT_EQ(vcpu_shared_irq_count, 1);
+
+ test_pmu_event_overflow(SBI_PMU_HW_INSTRUCTIONS);
+ GUEST_ASSERT_EQ(vcpu_shared_irq_count, 2);
+
+ GUEST_DONE();
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ case UCALL_SYNC:
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ break;
+ }
+}
+
+void test_vm_destroy(struct kvm_vm *vm)
+{
+ memset(ctrinfo_arr, 0, sizeof(union sbi_pmu_ctr_info) * RISCV_MAX_PMU_COUNTERS);
+ counter_mask_available = 0;
+ kvm_vm_free(vm);
+}
+
+static void test_vm_basic_test(void *guest_code)
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU),
+ "SBI PMU not available, skipping test");
+ vm_init_vector_tables(vm);
+ /* Illegal instruction handler is required to verify read access without configuration */
+ vm_install_exception_handler(vm, EXC_INST_ILLEGAL, guest_illegal_exception_handler);
+
+ vcpu_init_vector_tables(vcpu);
+ run_vcpu(vcpu);
+
+ test_vm_destroy(vm);
+}
+
+static void test_vm_events_test(void *guest_code)
+{
+ struct kvm_vm *vm = NULL;
+ struct kvm_vcpu *vcpu = NULL;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU),
+ "SBI PMU not available, skipping test");
+ run_vcpu(vcpu);
+
+ test_vm_destroy(vm);
+}
+
+static void test_vm_setup_snapshot_mem(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+ /* PMU Snapshot requires single page only */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, PMU_SNAPSHOT_GPA_BASE, 1, 1, 0);
+ /* PMU_SNAPSHOT_GPA_BASE is identity mapped */
+ virt_map(vm, PMU_SNAPSHOT_GPA_BASE, PMU_SNAPSHOT_GPA_BASE, 1);
+
+ snapshot_gva = (void *)(PMU_SNAPSHOT_GPA_BASE);
+ snapshot_gpa = addr_gva2gpa(vcpu->vm, (vm_vaddr_t)snapshot_gva);
+ sync_global_to_guest(vcpu->vm, snapshot_gva);
+ sync_global_to_guest(vcpu->vm, snapshot_gpa);
+}
+
+static void test_vm_events_snapshot_test(void *guest_code)
+{
+ struct kvm_vm *vm = NULL;
+ struct kvm_vcpu *vcpu;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU),
+ "SBI PMU not available, skipping test");
+
+ test_vm_setup_snapshot_mem(vm, vcpu);
+
+ run_vcpu(vcpu);
+
+ test_vm_destroy(vm);
+}
+
+static void test_vm_events_overflow(void *guest_code)
+{
+ struct kvm_vm *vm = NULL;
+ struct kvm_vcpu *vcpu;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU),
+ "SBI PMU not available, skipping test");
+
+ __TEST_REQUIRE(__vcpu_has_isa_ext(vcpu, KVM_RISCV_ISA_EXT_SSCOFPMF),
+ "Sscofpmf is not available, skipping overflow test");
+
+ test_vm_setup_snapshot_mem(vm, vcpu);
+ vm_init_vector_tables(vm);
+ vm_install_interrupt_handler(vm, guest_irq_handler);
+
+ vcpu_init_vector_tables(vcpu);
+ /* Initialize guest timer frequency. */
+ vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency), &timer_freq);
+ sync_global_to_guest(vm, timer_freq);
+
+ run_vcpu(vcpu);
+
+ test_vm_destroy(vm);
+}
+
+static void test_print_help(char *name)
+{
+ pr_info("Usage: %s [-h] [-d <test name>]\n", name);
+ pr_info("\t-d: Test to disable. Available tests are 'basic', 'events', 'snapshot', 'overflow'\n");
+ pr_info("\t-h: print this help screen\n");
+}
+
+static bool parse_args(int argc, char *argv[])
+{
+ int opt;
+
+ while ((opt = getopt(argc, argv, "hd:")) != -1) {
+ switch (opt) {
+ case 'd':
+ if (!strncmp("basic", optarg, 5))
+ disabled_tests |= SBI_PMU_TEST_BASIC;
+ else if (!strncmp("events", optarg, 6))
+ disabled_tests |= SBI_PMU_TEST_EVENTS;
+ else if (!strncmp("snapshot", optarg, 8))
+ disabled_tests |= SBI_PMU_TEST_SNAPSHOT;
+ else if (!strncmp("overflow", optarg, 8))
+ disabled_tests |= SBI_PMU_TEST_OVERFLOW;
+ else
+ goto done;
+ break;
+ case 'h':
+ default:
+ goto done;
+ }
+ }
+
+ return true;
+done:
+ test_print_help(argv[0]);
+ return false;
+}
+
+int main(int argc, char *argv[])
+{
+ if (!parse_args(argc, argv))
+ exit(KSFT_SKIP);
+
+ if (!(disabled_tests & SBI_PMU_TEST_BASIC)) {
+ test_vm_basic_test(test_pmu_basic_sanity);
+ pr_info("SBI PMU basic test : PASS\n");
+ }
+
+ if (!(disabled_tests & SBI_PMU_TEST_EVENTS)) {
+ test_vm_events_test(test_pmu_events);
+ pr_info("SBI PMU event verification test : PASS\n");
+ }
+
+ if (!(disabled_tests & SBI_PMU_TEST_SNAPSHOT)) {
+ test_vm_events_snapshot_test(test_pmu_events_snaphost);
+ pr_info("SBI PMU event verification with snapshot test : PASS\n");
+ }
+
+ if (!(disabled_tests & SBI_PMU_TEST_OVERFLOW)) {
+ test_vm_events_overflow(test_pmu_events_overflow);
+ pr_info("SBI PMU event verification with overflow test : PASS\n");
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index bd57d991e27d..68c899d27561 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -339,7 +339,7 @@ static void test_invalid_memory_region_flags(void)
#ifdef __x86_64__
if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))
- vm = vm_create_barebones_protected_vm();
+ vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM);
else
#endif
vm = vm_create_barebones();
@@ -462,7 +462,7 @@ static void test_add_private_memory_region(void)
pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n");
- vm = vm_create_barebones_protected_vm();
+ vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM);
test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail");
test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail");
@@ -471,7 +471,7 @@ static void test_add_private_memory_region(void)
test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail");
close(memfd);
- vm2 = vm_create_barebones_protected_vm();
+ vm2 = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM);
memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0);
test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail");
@@ -499,7 +499,7 @@ static void test_add_overlapping_private_memory_regions(void)
pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n");
- vm = vm_create_barebones_protected_vm();
+ vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM);
memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0);
diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c
index bae0c5026f82..2ff82c7fd926 100644
--- a/tools/testing/selftests/kvm/steal_time.c
+++ b/tools/testing/selftests/kvm/steal_time.c
@@ -11,7 +11,9 @@
#include <pthread.h>
#include <linux/kernel.h>
#include <asm/kvm.h>
-#ifndef __riscv
+#ifdef __riscv
+#include "sbi.h"
+#else
#include <asm/kvm_para.h>
#endif
diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c
new file mode 100644
index 000000000000..7a4a61be119b
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm.h>
+#include <linux/psp-sev.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "kselftest.h"
+
+#define SVM_SEV_FEAT_DEBUG_SWAP 32u
+
+/*
+ * Some features may have hidden dependencies, or may only work
+ * for certain VM types. Err on the side of safety and don't
+ * expect that all supported features can be passed one by one
+ * to KVM_SEV_INIT2.
+ *
+ * (Well, right now there's only one...)
+ */
+#define KNOWN_FEATURES SVM_SEV_FEAT_DEBUG_SWAP
+
+int kvm_fd;
+u64 supported_vmsa_features;
+bool have_sev_es;
+
+static int __sev_ioctl(int vm_fd, int cmd_id, void *data)
+{
+ struct kvm_sev_cmd cmd = {
+ .id = cmd_id,
+ .data = (uint64_t)data,
+ .sev_fd = open_sev_dev_path_or_exit(),
+ };
+ int ret;
+
+ ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
+ TEST_ASSERT(ret < 0 || cmd.error == SEV_RET_SUCCESS,
+ "%d failed: fw error: %d\n",
+ cmd_id, cmd.error);
+
+ return ret;
+}
+
+static void test_init2(unsigned long vm_type, struct kvm_sev_init *init)
+{
+ struct kvm_vm *vm;
+ int ret;
+
+ vm = vm_create_barebones_type(vm_type);
+ ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init);
+ TEST_ASSERT(ret == 0,
+ "KVM_SEV_INIT2 return code is %d (expected 0), errno: %d",
+ ret, errno);
+ kvm_vm_free(vm);
+}
+
+static void test_init2_invalid(unsigned long vm_type, struct kvm_sev_init *init, const char *msg)
+{
+ struct kvm_vm *vm;
+ int ret;
+
+ vm = vm_create_barebones_type(vm_type);
+ ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init);
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "KVM_SEV_INIT2 should fail, %s.",
+ msg);
+ kvm_vm_free(vm);
+}
+
+void test_vm_types(void)
+{
+ test_init2(KVM_X86_SEV_VM, &(struct kvm_sev_init){});
+
+ /*
+ * TODO: check that unsupported types cannot be created. Probably
+ * a separate selftest.
+ */
+ if (have_sev_es)
+ test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){});
+
+ test_init2_invalid(0, &(struct kvm_sev_init){},
+ "VM type is KVM_X86_DEFAULT_VM");
+ if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))
+ test_init2_invalid(KVM_X86_SW_PROTECTED_VM, &(struct kvm_sev_init){},
+ "VM type is KVM_X86_SW_PROTECTED_VM");
+}
+
+void test_flags(uint32_t vm_type)
+{
+ int i;
+
+ for (i = 0; i < 32; i++)
+ test_init2_invalid(vm_type,
+ &(struct kvm_sev_init){ .flags = BIT(i) },
+ "invalid flag");
+}
+
+void test_features(uint32_t vm_type, uint64_t supported_features)
+{
+ int i;
+
+ for (i = 0; i < 64; i++) {
+ if (!(supported_features & (1u << i)))
+ test_init2_invalid(vm_type,
+ &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) },
+ "unknown feature");
+ else if (KNOWN_FEATURES & (1u << i))
+ test_init2(vm_type,
+ &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) });
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ int kvm_fd = open_kvm_dev_path_or_exit();
+ bool have_sev;
+
+ TEST_REQUIRE(__kvm_has_device_attr(kvm_fd, KVM_X86_GRP_SEV,
+ KVM_X86_SEV_VMSA_FEATURES) == 0);
+ kvm_device_attr_get(kvm_fd, KVM_X86_GRP_SEV,
+ KVM_X86_SEV_VMSA_FEATURES,
+ &supported_vmsa_features);
+
+ have_sev = kvm_cpu_has(X86_FEATURE_SEV);
+ TEST_ASSERT(have_sev == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)),
+ "sev: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)",
+ kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM);
+
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM));
+ have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES);
+
+ TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)),
+ "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)",
+ kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM);
+
+ test_vm_types();
+
+ test_flags(KVM_X86_SEV_VM);
+ if (have_sev_es)
+ test_flags(KVM_X86_SEV_ES_VM);
+
+ test_features(KVM_X86_SEV_VM, 0);
+ if (have_sev_es)
+ test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
index 026779f3ed06..7c70c0da4fb7 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
@@ -4,6 +4,7 @@
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
+#include <math.h>
#include "test_util.h"
#include "kvm_util.h"
@@ -13,6 +14,8 @@
#include "sev.h"
+#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)
+
static void guest_sev_es_code(void)
{
/* TODO: Check CPUID after GHCB-based hypercall support is added. */
@@ -35,13 +38,98 @@ static void guest_sev_code(void)
GUEST_DONE();
}
+/* Stash state passed via VMSA before any compiled code runs. */
+extern void guest_code_xsave(void);
+asm("guest_code_xsave:\n"
+ "mov $-1, %eax\n"
+ "mov $-1, %edx\n"
+ "xsave (%rdi)\n"
+ "jmp guest_sev_es_code");
+
+static void compare_xsave(u8 *from_host, u8 *from_guest)
+{
+ int i;
+ bool bad = false;
+ for (i = 0; i < 4095; i++) {
+ if (from_host[i] != from_guest[i]) {
+ printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]);
+ bad = true;
+ }
+ }
+
+ if (bad)
+ abort();
+}
+
+static void test_sync_vmsa(uint32_t policy)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ vm_vaddr_t gva;
+ void *hva;
+
+ double x87val = M_PI;
+ struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 };
+ struct kvm_sregs sregs;
+ struct kvm_xcrs xcrs = {
+ .nr_xcrs = 1,
+ .xcrs[0].xcr = 0,
+ .xcrs[0].value = XFEATURE_MASK_X87_AVX,
+ };
+
+ vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu);
+ gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR,
+ MEM_REGION_TEST_DATA);
+ hva = addr_gva2hva(vm, gva);
+
+ vcpu_args_set(vcpu, 1, gva);
+
+ vcpu_sregs_get(vcpu, &sregs);
+ sregs.cr4 |= X86_CR4_OSFXSR | X86_CR4_OSXSAVE;
+ vcpu_sregs_set(vcpu, &sregs);
+
+ vcpu_xcrs_set(vcpu, &xcrs);
+ asm("fninit\n"
+ "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n"
+ "fldl %3\n"
+ "xsave (%2)\n"
+ "fstp %%st\n"
+ : "=m"(xsave)
+ : "A"(XFEATURE_MASK_X87_AVX), "r"(&xsave), "m" (x87val)
+ : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
+ vcpu_xsave_set(vcpu, &xsave);
+
+ vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL);
+
+ /* This page is shared, so make it decrypted. */
+ memset(hva, 0, 4096);
+
+ vcpu_run(vcpu);
+
+ TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT,
+ "Wanted SYSTEM_EVENT, got %s",
+ exit_reason_str(vcpu->run->exit_reason));
+ TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM);
+ TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1);
+ TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ);
+
+ compare_xsave((u8 *)&xsave, (u8 *)hva);
+
+ kvm_vm_free(vm);
+}
+
static void test_sev(void *guest_code, uint64_t policy)
{
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
struct ucall uc;
- vm = vm_sev_create_with_one_vcpu(policy, guest_code, &vcpu);
+ uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM;
+
+ vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu);
+
+ /* TODO: Validate the measurement is as expected. */
+ vm_sev_launch(vm, policy, NULL);
for (;;) {
vcpu_run(vcpu);
@@ -82,6 +170,12 @@ int main(int argc, char *argv[])
if (kvm_cpu_has(X86_FEATURE_SEV_ES)) {
test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG);
test_sev(guest_sev_es_code, SEV_POLICY_ES);
+
+ if (kvm_has_cap(KVM_CAP_XCRS) &&
+ (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) {
+ test_sync_vmsa(0);
+ test_sync_vmsa(SEV_POLICY_NO_DBG);
+ }
}
return 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ff0a20565f90..38694f95bb56 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -401,12 +401,17 @@ static void kvm_flush_shadow_all(struct kvm *kvm)
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
gfp_t gfp_flags)
{
+ void *page;
+
gfp_flags |= mc->gfp_zero;
if (mc->kmem_cache)
return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
- else
- return (void *)__get_free_page(gfp_flags);
+
+ page = (void *)__get_free_page(gfp_flags);
+ if (page && mc->init_value)
+ memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
+ return page;
}
int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
@@ -421,6 +426,13 @@ int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity,
if (WARN_ON_ONCE(!capacity))
return -EIO;
+ /*
+ * Custom init values can be used only for page allocations,
+ * and obviously conflict with __GFP_ZERO.
+ */
+ if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
+ return -EIO;
+
mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
if (!mc->objects)
return -ENOMEM;
@@ -583,8 +595,6 @@ static void kvm_null_fn(void)
}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
-static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
-
/* Iterate over each memslot intersecting [start, last] (inclusive) range */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \
for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
@@ -670,14 +680,12 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
- union kvm_mmu_notifier_arg arg,
gfn_handler_t handler)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
const struct kvm_mmu_notifier_range range = {
.start = start,
.end = end,
- .arg = arg,
.handler = handler,
.on_lock = (void *)kvm_null_fn,
.flush_on_ret = true,
@@ -705,48 +713,6 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
return __kvm_handle_hva_range(kvm, &range).ret;
}
-static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- /*
- * Skipping invalid memslots is correct if and only change_pte() is
- * surrounded by invalidate_range_{start,end}(), which is currently
- * guaranteed by the primary MMU. If that ever changes, KVM needs to
- * unmap the memslot instead of skipping the memslot to ensure that KVM
- * doesn't hold references to the old PFN.
- */
- WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
-
- if (range->slot->flags & KVM_MEMSLOT_INVALID)
- return false;
-
- return kvm_set_spte_gfn(kvm, range);
-}
-
-static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long address,
- pte_t pte)
-{
- struct kvm *kvm = mmu_notifier_to_kvm(mn);
- const union kvm_mmu_notifier_arg arg = { .pte = pte };
-
- trace_kvm_set_spte_hva(address);
-
- /*
- * .change_pte() must be surrounded by .invalidate_range_{start,end}().
- * If mmu_invalidate_in_progress is zero, then no in-progress
- * invalidations, including this one, found a relevant memslot at
- * start(); rechecking memslots here is unnecessary. Note, a false
- * positive (count elevated by a different invalidation) is sub-optimal
- * but functionally ok.
- */
- WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
- if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
- return;
-
- kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
-}
-
void kvm_mmu_invalidate_begin(struct kvm *kvm)
{
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -909,8 +875,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
{
trace_kvm_age_hva(start, end);
- return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
- kvm_age_gfn);
+ return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
}
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -963,7 +928,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
.clear_young = kvm_mmu_notifier_clear_young,
.test_young = kvm_mmu_notifier_test_young,
- .change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,
};