aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/index.rst2
-rw-r--r--Documentation/virt/kvm/running-nested-guests.rst276
-rw-r--r--arch/arm64/kvm/guest.c7
-rw-r--r--arch/arm64/kvm/hyp/entry.S23
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S1
-rw-r--r--arch/arm64/kvm/hyp/sysreg-sr.c17
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c9
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c9
-rw-r--r--arch/powerpc/kvm/powerpc.c1
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--arch/x86/include/asm/kvm_host.h4
-rw-r--r--arch/x86/include/asm/nospec-branch.h21
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/ioapic.c10
-rw-r--r--arch/x86/kvm/svm/sev.c6
-rw-r--r--arch/x86/kvm/svm/svm.c12
-rw-r--r--arch/x86/kvm/svm/vmenter.S10
-rw-r--r--arch/x86/kvm/vmx/nested.c23
-rw-r--r--arch/x86/kvm/vmx/vmenter.S3
-rw-r--r--arch/x86/kvm/vmx/vmx.c2
-rw-r--r--arch/x86/kvm/x86.c42
-rw-r--r--include/linux/kvm_host.h2
-rw-r--r--tools/testing/selftests/kvm/include/evmcs.h4
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/vmx.c3
-rw-r--r--virt/kvm/arm/hyp/aarch32.c8
-rw-r--r--virt/kvm/arm/psci.c40
-rw-r--r--virt/kvm/arm/vgic/vgic-init.c19
-rw-r--r--virt/kvm/arm/vgic/vgic-its.c11
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v2.c16
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v3.c31
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.c228
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.h19
32 files changed, 697 insertions, 170 deletions
diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst
index dcc252634cf9..b6833c7bb474 100644
--- a/Documentation/virt/kvm/index.rst
+++ b/Documentation/virt/kvm/index.rst
@@ -28,3 +28,5 @@ KVM
arm/index
devices/index
+
+ running-nested-guests
diff --git a/Documentation/virt/kvm/running-nested-guests.rst b/Documentation/virt/kvm/running-nested-guests.rst
new file mode 100644
index 000000000000..d0a1fc754c84
--- /dev/null
+++ b/Documentation/virt/kvm/running-nested-guests.rst
@@ -0,0 +1,276 @@
+==============================
+Running nested guests with KVM
+==============================
+
+A nested guest is the ability to run a guest inside another guest (it
+can be KVM-based or a different hypervisor). The straightforward
+example is a KVM guest that in turn runs on a KVM guest (the rest of
+this document is built on this example)::
+
+ .----------------. .----------------.
+ | | | |
+ | L2 | | L2 |
+ | (Nested Guest) | | (Nested Guest) |
+ | | | |
+ |----------------'--'----------------|
+ | |
+ | L1 (Guest Hypervisor) |
+ | KVM (/dev/kvm) |
+ | |
+ .------------------------------------------------------.
+ | L0 (Host Hypervisor) |
+ | KVM (/dev/kvm) |
+ |------------------------------------------------------|
+ | Hardware (with virtualization extensions) |
+ '------------------------------------------------------'
+
+Terminology:
+
+- L0 – level-0; the bare metal host, running KVM
+
+- L1 – level-1 guest; a VM running on L0; also called the "guest
+ hypervisor", as it itself is capable of running KVM.
+
+- L2 – level-2 guest; a VM running on L1, this is the "nested guest"
+
+.. note:: The above diagram is modelled after the x86 architecture;
+ s390x, ppc64 and other architectures are likely to have
+ a different design for nesting.
+
+ For example, s390x always has an LPAR (LogicalPARtition)
+ hypervisor running on bare metal, adding another layer and
+ resulting in at least four levels in a nested setup — L0 (bare
+ metal, running the LPAR hypervisor), L1 (host hypervisor), L2
+ (guest hypervisor), L3 (nested guest).
+
+ This document will stick with the three-level terminology (L0,
+ L1, and L2) for all architectures; and will largely focus on
+ x86.
+
+
+Use Cases
+---------
+
+There are several scenarios where nested KVM can be useful, to name a
+few:
+
+- As a developer, you want to test your software on different operating
+ systems (OSes). Instead of renting multiple VMs from a Cloud
+ Provider, using nested KVM lets you rent a large enough "guest
+ hypervisor" (level-1 guest). This in turn allows you to create
+ multiple nested guests (level-2 guests), running different OSes, on
+ which you can develop and test your software.
+
+- Live migration of "guest hypervisors" and their nested guests, for
+ load balancing, disaster recovery, etc.
+
+- VM image creation tools (e.g. ``virt-install``, etc) often run
+ their own VM, and users expect these to work inside a VM.
+
+- Some OSes use virtualization internally for security (e.g. to let
+ applications run safely in isolation).
+
+
+Enabling "nested" (x86)
+-----------------------
+
+From Linux kernel v4.19 onwards, the ``nested`` KVM parameter is enabled
+by default for Intel and AMD. (Though your Linux distribution might
+override this default.)
+
+In case you are running a Linux kernel older than v4.19, to enable
+nesting, set the ``nested`` KVM module parameter to ``Y`` or ``1``. To
+persist this setting across reboots, you can add it in a config file, as
+shown below:
+
+1. On the bare metal host (L0), list the kernel modules and ensure that
+ the KVM modules::
+
+ $ lsmod | grep -i kvm
+ kvm_intel 133627 0
+ kvm 435079 1 kvm_intel
+
+2. Show information for ``kvm_intel`` module::
+
+ $ modinfo kvm_intel | grep -i nested
+ parm: nested:bool
+
+3. For the nested KVM configuration to persist across reboots, place the
+ below in ``/etc/modprobed/kvm_intel.conf`` (create the file if it
+ doesn't exist)::
+
+ $ cat /etc/modprobe.d/kvm_intel.conf
+ options kvm-intel nested=y
+
+4. Unload and re-load the KVM Intel module::
+
+ $ sudo rmmod kvm-intel
+ $ sudo modprobe kvm-intel
+
+5. Verify if the ``nested`` parameter for KVM is enabled::
+
+ $ cat /sys/module/kvm_intel/parameters/nested
+ Y
+
+For AMD hosts, the process is the same as above, except that the module
+name is ``kvm-amd``.
+
+
+Additional nested-related kernel parameters (x86)
+-------------------------------------------------
+
+If your hardware is sufficiently advanced (Intel Haswell processor or
+higher, which has newer hardware virt extensions), the following
+additional features will also be enabled by default: "Shadow VMCS
+(Virtual Machine Control Structure)", APIC Virtualization on your bare
+metal host (L0). Parameters for Intel hosts::
+
+ $ cat /sys/module/kvm_intel/parameters/enable_shadow_vmcs
+ Y
+
+ $ cat /sys/module/kvm_intel/parameters/enable_apicv
+ Y
+
+ $ cat /sys/module/kvm_intel/parameters/ept
+ Y
+
+.. note:: If you suspect your L2 (i.e. nested guest) is running slower,
+ ensure the above are enabled (particularly
+ ``enable_shadow_vmcs`` and ``ept``).
+
+
+Starting a nested guest (x86)
+-----------------------------
+
+Once your bare metal host (L0) is configured for nesting, you should be
+able to start an L1 guest with::
+
+ $ qemu-kvm -cpu host [...]
+
+The above will pass through the host CPU's capabilities as-is to the
+gues); or for better live migration compatibility, use a named CPU
+model supported by QEMU. e.g.::
+
+ $ qemu-kvm -cpu Haswell-noTSX-IBRS,vmx=on
+
+then the guest hypervisor will subsequently be capable of running a
+nested guest with accelerated KVM.
+
+
+Enabling "nested" (s390x)
+-------------------------
+
+1. On the host hypervisor (L0), enable the ``nested`` parameter on
+ s390x::
+
+ $ rmmod kvm
+ $ modprobe kvm nested=1
+
+.. note:: On s390x, the kernel parameter ``hpage`` is mutually exclusive
+ with the ``nested`` paramter — i.e. to be able to enable
+ ``nested``, the ``hpage`` parameter *must* be disabled.
+
+2. The guest hypervisor (L1) must be provided with the ``sie`` CPU
+ feature — with QEMU, this can be done by using "host passthrough"
+ (via the command-line ``-cpu host``).
+
+3. Now the KVM module can be loaded in the L1 (guest hypervisor)::
+
+ $ modprobe kvm
+
+
+Live migration with nested KVM
+------------------------------
+
+Migrating an L1 guest, with a *live* nested guest in it, to another
+bare metal host, works as of Linux kernel 5.3 and QEMU 4.2.0 for
+Intel x86 systems, and even on older versions for s390x.
+
+On AMD systems, once an L1 guest has started an L2 guest, the L1 guest
+should no longer be migrated or saved (refer to QEMU documentation on
+"savevm"/"loadvm") until the L2 guest shuts down. Attempting to migrate
+or save-and-load an L1 guest while an L2 guest is running will result in
+undefined behavior. You might see a ``kernel BUG!`` entry in ``dmesg``, a
+kernel 'oops', or an outright kernel panic. Such a migrated or loaded L1
+guest can no longer be considered stable or secure, and must be restarted.
+Migrating an L1 guest merely configured to support nesting, while not
+actually running L2 guests, is expected to function normally even on AMD
+systems but may fail once guests are started.
+
+Migrating an L2 guest is always expected to succeed, so all the following
+scenarios should work even on AMD systems:
+
+- Migrating a nested guest (L2) to another L1 guest on the *same* bare
+ metal host.
+
+- Migrating a nested guest (L2) to another L1 guest on a *different*
+ bare metal host.
+
+- Migrating a nested guest (L2) to a bare metal host.
+
+Reporting bugs from nested setups
+-----------------------------------
+
+Debugging "nested" problems can involve sifting through log files across
+L0, L1 and L2; this can result in tedious back-n-forth between the bug
+reporter and the bug fixer.
+
+- Mention that you are in a "nested" setup. If you are running any kind
+ of "nesting" at all, say so. Unfortunately, this needs to be called
+ out because when reporting bugs, people tend to forget to even
+ *mention* that they're using nested virtualization.
+
+- Ensure you are actually running KVM on KVM. Sometimes people do not
+ have KVM enabled for their guest hypervisor (L1), which results in
+ them running with pure emulation or what QEMU calls it as "TCG", but
+ they think they're running nested KVM. Thus confusing "nested Virt"
+ (which could also mean, QEMU on KVM) with "nested KVM" (KVM on KVM).
+
+Information to collect (generic)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following is not an exhaustive list, but a very good starting point:
+
+ - Kernel, libvirt, and QEMU version from L0
+
+ - Kernel, libvirt and QEMU version from L1
+
+ - QEMU command-line of L1 -- when using libvirt, you'll find it here:
+ ``/var/log/libvirt/qemu/instance.log``
+
+ - QEMU command-line of L2 -- as above, when using libvirt, get the
+ complete libvirt-generated QEMU command-line
+
+ - ``cat /sys/cpuinfo`` from L0
+
+ - ``cat /sys/cpuinfo`` from L1
+
+ - ``lscpu`` from L0
+
+ - ``lscpu`` from L1
+
+ - Full ``dmesg`` output from L0
+
+ - Full ``dmesg`` output from L1
+
+x86-specific info to collect
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Both the below commands, ``x86info`` and ``dmidecode``, should be
+available on most Linux distributions with the same name:
+
+ - Output of: ``x86info -a`` from L0
+
+ - Output of: ``x86info -a`` from L1
+
+ - Output of: ``dmidecode`` from L0
+
+ - Output of: ``dmidecode`` from L1
+
+s390x-specific info to collect
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Along with the earlier mentioned generic details, the below is
+also recommended:
+
+ - ``/proc/sysinfo`` from L1; this will also include the info from L0
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 23ebe51410f0..50a279d3ddd7 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -200,6 +200,13 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
}
memcpy((u32 *)regs + off, valp, KVM_REG_SIZE(reg->id));
+
+ if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) {
+ int i;
+
+ for (i = 0; i < 16; i++)
+ *vcpu_reg32(vcpu, i) = (u32)*vcpu_reg32(vcpu, i);
+ }
out:
return err;
}
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index d22d0534dd60..90186cf6473e 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -18,6 +18,7 @@
#define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x)
#define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
+#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8)
.text
.pushsection .hyp.text, "ax"
@@ -47,6 +48,16 @@
ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
.endm
+.macro save_sp_el0 ctxt, tmp
+ mrs \tmp, sp_el0
+ str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+.endm
+
+.macro restore_sp_el0 ctxt, tmp
+ ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+ msr sp_el0, \tmp
+.endm
+
/*
* u64 __guest_enter(struct kvm_vcpu *vcpu,
* struct kvm_cpu_context *host_ctxt);
@@ -60,6 +71,9 @@ SYM_FUNC_START(__guest_enter)
// Store the host regs
save_callee_saved_regs x1
+ // Save the host's sp_el0
+ save_sp_el0 x1, x2
+
// Now the host state is stored if we have a pending RAS SError it must
// affect the host. If any asynchronous exception is pending we defer
// the guest entry. The DSB isn't necessary before v8.2 as any SError
@@ -83,6 +97,9 @@ alternative_else_nop_endif
// when this feature is enabled for kernel code.
ptrauth_switch_to_guest x29, x0, x1, x2
+ // Restore the guest's sp_el0
+ restore_sp_el0 x29, x0
+
// Restore guest regs x0-x17
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
@@ -130,6 +147,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Store the guest regs x18-x29, lr
save_callee_saved_regs x1
+ // Store the guest's sp_el0
+ save_sp_el0 x1, x2
+
get_host_ctxt x2, x3
// Macro ptrauth_switch_to_guest format:
@@ -139,6 +159,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// when this feature is enabled for kernel code.
ptrauth_switch_to_host x1, x2, x3, x4, x5
+ // Restore the hosts's sp_el0
+ restore_sp_el0 x2, x3
+
// Now restore the host regs
restore_callee_saved_regs x2
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index c2a13ab3c471..9c5cfb04170e 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -198,7 +198,6 @@ SYM_CODE_END(__hyp_panic)
.macro invalid_vector label, target = __hyp_panic
.align 2
SYM_CODE_START(\label)
-\label:
b \target
SYM_CODE_END(\label)
.endm
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 75b1925763f1..6d2df9fe0b5d 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -15,8 +15,9 @@
/*
* Non-VHE: Both host and guest must save everything.
*
- * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and pstate,
- * which are handled as part of the el2 return state) on every switch.
+ * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and
+ * pstate, which are handled as part of the el2 return state) on every
+ * switch (sp_el0 is being dealt with in the assembly code).
* tpidr_el0 and tpidrro_el0 only need to be switched when going
* to host userspace or a different VCPU. EL1 registers only need to be
* switched when potentially going to run a different VCPU. The latter two
@@ -26,12 +27,6 @@
static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
{
ctxt->sys_regs[MDSCR_EL1] = read_sysreg(mdscr_el1);
-
- /*
- * The host arm64 Linux uses sp_el0 to point to 'current' and it must
- * therefore be saved/restored on every entry/exit to/from the guest.
- */
- ctxt->gp_regs.regs.sp = read_sysreg(sp_el0);
}
static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
@@ -99,12 +94,6 @@ NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe);
static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
{
write_sysreg(ctxt->sys_regs[MDSCR_EL1], mdscr_el1);
-
- /*
- * The host arm64 Linux uses sp_el0 to point to 'current' and it must
- * therefore be saved/restored on every entry/exit to/from the guest.
- */
- write_sysreg(ctxt->gp_regs.regs.sp, sp_el0);
}
static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 6404df613ea3..2b35f9bcf892 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -604,18 +604,19 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
local_irq_disable();
ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+ pte = __pte(0);
+ if (ptep)
+ pte = *ptep;
+ local_irq_enable();
/*
* If the PTE disappeared temporarily due to a THP
* collapse, just return and let the guest try again.
*/
- if (!ptep) {
- local_irq_enable();
+ if (!pte_present(pte)) {
if (page)
put_page(page);
return RESUME_GUEST;
}
- pte = *ptep;
- local_irq_enable();
hpa = pte_pfn(pte) << PAGE_SHIFT;
pte_size = PAGE_SIZE;
if (shift)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 9f050064d2a2..aa12cd4078b3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -815,18 +815,19 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
*/
local_irq_disable();
ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+ pte = __pte(0);
+ if (ptep)
+ pte = *ptep;
+ local_irq_enable();
/*
* If the PTE disappeared temporarily due to a THP
* collapse, just return and let the guest try again.
*/
- if (!ptep) {
- local_irq_enable();
+ if (!pte_present(pte)) {
if (page)
put_page(page);
return RESUME_GUEST;
}
- pte = *ptep;
- local_irq_enable();
/* If we're logging dirty pages, always map single pages */
large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index e15166b0a16d..ad2f172c26a6 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -521,6 +521,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IOEVENTFD:
case KVM_CAP_DEVICE_CTRL:
case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_SET_GUEST_DEBUG:
r = 1;
break;
case KVM_CAP_PPC_GUEST_DEBUG_SSTEP:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 19a81024fe16..d05bb040fd42 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -545,6 +545,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_AIS:
case KVM_CAP_S390_AIS_MIGRATION:
case KVM_CAP_S390_VCPU_RESETS:
+ case KVM_CAP_SET_GUEST_DEBUG:
r = 1;
break;
case KVM_CAP_S390_HPAGE_1M:
@@ -1939,6 +1940,9 @@ static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
start = slot + 1;
}
+ if (start >= slots->used_slots)
+ return slots->used_slots - 1;
+
if (gfn >= memslots[start].base_gfn &&
gfn < memslots[start].base_gfn + memslots[start].npages) {
atomic_set(&slots->lru_slot, start);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42a2d0d3984a..0dea9f122bb9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1663,8 +1663,8 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
{
/* We can only post Fixed and LowPrio IRQs */
- return (irq->delivery_mode == dest_Fixed ||
- irq->delivery_mode == dest_LowestPrio);
+ return (irq->delivery_mode == APIC_DM_FIXED ||
+ irq->delivery_mode == APIC_DM_LOWEST);
}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 07e95dcb40ad..7e9a281e2660 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -237,27 +237,6 @@ enum ssb_mitigation {
extern char __indirect_thunk_start[];
extern char __indirect_thunk_end[];
-/*
- * On VMEXIT we must ensure that no RSB predictions learned in the guest
- * can be followed in the host, by overwriting the RSB completely. Both
- * retpoline and IBRS mitigations for Spectre v2 need this; only on future
- * CPUs with IBRS_ALL *might* it be avoided.
- */
-static inline void vmexit_fill_RSB(void)
-{
-#ifdef CONFIG_RETPOLINE
- unsigned long loops;
-
- asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
- ALTERNATIVE("jmp 910f",
- __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
- X86_FEATURE_RETPOLINE)
- "910:"
- : "=r" (loops), ASM_CALL_CONSTRAINT
- : : "memory" );
-#endif
-}
-
static __always_inline
void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
{
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a789759b7261..4a3081e9f4b5 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,6 +3,10 @@
ccflags-y += -Iarch/x86/kvm
ccflags-$(CONFIG_KVM_WERROR) += -Werror
+ifeq ($(CONFIG_FRAME_POINTER),y)
+OBJECT_FILES_NON_STANDARD_vmenter.o := y
+endif
+
KVM := ../../../virt/kvm
kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 750ff0b29404..d057376bd3d3 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -225,12 +225,12 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
}
/*
- * AMD SVM AVIC accelerate EOI write and do not trap,
- * in-kernel IOAPIC will not be able to receive the EOI.
- * In this case, we do lazy update of the pending EOI when
- * trying to set IOAPIC irq.
+ * AMD SVM AVIC accelerate EOI write iff the interrupt is edge
+ * triggered, in which case the in-kernel IOAPIC will not be able
+ * to receive the EOI. In this case, we do a lazy update of the
+ * pending EOI when trying to set IOAPIC irq.
*/
- if (kvm_apicv_activated(ioapic->kvm))
+ if (edge && kvm_apicv_activated(ioapic->kvm))
ioapic_lazy_update_eoi(ioapic, irq);
/*
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0e3fc311d7da..cf912b4aaba8 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/psp-sev.h>
+#include <linux/pagemap.h>
#include <linux/swap.h>
#include "x86.h"
@@ -1117,7 +1118,7 @@ int __init sev_hardware_setup(void)
/* Maximum number of encrypted guests supported simultaneously */
max_sev_asid = cpuid_ecx(0x8000001F);
- if (!max_sev_asid)
+ if (!svm_sev_enabled())
return 1;
/* Minimum ASID value that should be used for SEV guest */
@@ -1156,6 +1157,9 @@ err:
void sev_hardware_teardown(void)
{
+ if (!svm_sev_enabled())
+ return;
+
bitmap_free(sev_asid_bitmap);
bitmap_free(sev_reclaim_asid_bitmap);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2be5bbae3a40..38f6aeefeb55 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1752,6 +1752,8 @@ static int db_interception(struct vcpu_svm *svm)
if (svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
+ kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
kvm_run->debug.arch.pc =
svm->vmcb->save.cs.base + svm->vmcb->save.rip;
kvm_run->debug.arch.exception = DB_VECTOR;
@@ -3276,7 +3278,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
svm_complete_interrupts(svm);
}
-bool __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -3330,13 +3332,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
*/
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- local_irq_enable();
-
__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
- /* Eliminate branch target predictions from guest mode */
- vmexit_fill_RSB();
-
#ifdef CONFIG_X86_64
wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
@@ -3366,8 +3363,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
reload_tss(vcpu);
- local_irq_disable();
-
x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
vcpu->arch.cr2 = svm->vmcb->save.cr2;
@@ -3411,7 +3406,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
mark_all_clean(svm->vmcb);
}
-STACK_FRAME_NON_STANDARD(svm_vcpu_run);
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
{
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index fa1af90067e9..bf944334003a 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -3,6 +3,7 @@
#include <asm/asm.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -35,7 +36,6 @@
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
- mov %_ASM_SP, %_ASM_BP
#ifdef CONFIG_X86_64
push %r15
push %r14
@@ -78,6 +78,7 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_AX
/* Enter guest mode */
+ sti
1: vmload %_ASM_AX
jmp 3f
2: cmpb $0, kvm_rebooting
@@ -99,6 +100,13 @@ SYM_FUNC_START(__svm_vcpu_run)
ud2
_ASM_EXTABLE(5b, 6b)
7:
+ cli
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
/* "POP" @regs to RAX. */
pop %_ASM_AX
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index cbc9ea2de28f..e44f33c82332 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5165,7 +5165,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
*/
break;
default:
- BUG_ON(1);
+ BUG();
break;
}
@@ -5533,8 +5533,25 @@ static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
return 1 & (b >> (field & 7));
}
+static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
+{
+ u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
+
+ if (nested_cpu_has_mtf(vmcs12))
+ return true;
+
+ /*
+ * An MTF VM-exit may be injected into the guest by setting the
+ * interruption-type to 7 (other event) and the vector field to 0. Such
+ * is the case regardless of the 'monitor trap flag' VM-execution
+ * control.
+ */
+ return entry_intr_info == (INTR_INFO_VALID_MASK
+ | INTR_TYPE_OTHER_EVENT);
+}
+
/*
- * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
+ * Return true if we should exit from L2 to L1 to handle an exit, or false if we
* should handle it ourselves in L0 (and then continue L2). Only call this
* when in is_guest_mode (L2).
*/
@@ -5633,7 +5650,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_MWAIT_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
case EXIT_REASON_MONITOR_TRAP_FLAG:
- return nested_cpu_has_mtf(vmcs12);
+ return nested_vmx_exit_handled_mtf(vmcs12);
case EXIT_REASON_MONITOR_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
case EXIT_REASON_PAUSE_INSTRUCTION:
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 87f3f24fef37..51d1a82742fd 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -82,6 +82,9 @@ SYM_FUNC_START(vmx_vmexit)
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+ /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */
+ or $1, %_ASM_AX
+
pop %_ASM_AX
.Lvmexit_skip_rsb:
#endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 83050977490c..c2c6335a998c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4572,7 +4572,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
*/
static void kvm_machine_check(void)
{
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_MCE)
struct pt_regs regs = {
.cs = 3, /* Fake ring 3 no matter what the guest ran on */
.flags = X86_EFLAGS_IF,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3bf2ecafd027..d786c7d27ce5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -926,19 +926,6 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr);
__reserved_bits; \
})
-static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
-{
- u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
-
- if (kvm_cpu_cap_has(X86_FEATURE_LA57))
- reserved_bits &= ~X86_CR4_LA57;
-
- if (kvm_cpu_cap_has(X86_FEATURE_UMIP))
- reserved_bits &= ~X86_CR4_UMIP;
-
- return reserved_bits;
-}
-
static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & cr4_reserved_bits)
@@ -3060,6 +3047,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_PERF_CTL:
case MSR_AMD64_DC_CFG:
case MSR_F15H_EX_CFG:
+ /*
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
+ * limit) MSRs. Just return 0, as we do not want to expose the host
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
+ * so for existing CPU-specific MSRs.
+ */
+ case MSR_RAPL_POWER_UNIT:
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
msr_info->data = 0;
break;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
@@ -3374,6 +3372,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
+ case KVM_CAP_SET_GUEST_DEBUG:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -5049,10 +5048,13 @@ set_identity_unlock:
r = -EFAULT;
if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
goto out;
+ mutex_lock(&kvm->lock);
r = -ENXIO;
if (!kvm->arch.vpit)
- goto out;
+ goto set_pit_out;
r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
+set_pit_out:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_GET_PIT2: {
@@ -5072,10 +5074,13 @@ set_identity_unlock:
r = -EFAULT;
if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
goto out;
+ mutex_lock(&kvm->lock);
r = -ENXIO;
if (!kvm->arch.vpit)
- goto out;
+ goto set_pit2_out;
r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+set_pit2_out:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_REINJECT_CONTROL: {
@@ -9658,7 +9663,9 @@ int kvm_arch_hardware_setup(void *opaque)
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
supported_xss = 0;
- cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
if (kvm_has_tsc_control) {
/*
@@ -9690,7 +9697,8 @@ int kvm_arch_check_processor_compat(void *opaque)
WARN_ON(!irqs_disabled());
- if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits)
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
return -EIO;
return ops->check_processor_compatibility();
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6d58beb65454..01276e3d01b9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1048,7 +1048,7 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn)
start = slot + 1;
}
- if (gfn >= memslots[start].base_gfn &&
+ if (start < slots->used_slots && gfn >= memslots[start].base_gfn &&
gfn < memslots[start].base_gfn + memslots[start].npages) {
atomic_set(&slots->lru_slot, start);
return &memslots[start];
diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h
index d8f4d6bfe05d..a034438b6266 100644
--- a/tools/testing/selftests/kvm/include/evmcs.h
+++ b/tools/testing/selftests/kvm/include/evmcs.h
@@ -219,8 +219,8 @@ struct hv_enlightened_vmcs {
#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
(~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
-struct hv_enlightened_vmcs *current_evmcs;
-struct hv_vp_assist_page *current_vp_assist;
+extern struct hv_enlightened_vmcs *current_evmcs;
+extern struct hv_vp_assist_page *current_vp_assist;
int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
index 6f17f69394be..4ae104f6ce69 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -17,6 +17,9 @@
bool enable_evmcs;
+struct hv_enlightened_vmcs *current_evmcs;
+struct hv_vp_assist_page *current_vp_assist;
+
struct eptPageTableEntry {
uint64_t readable:1;
uint64_t writable:1;
diff --git a/virt/kvm/arm/hyp/aarch32.c b/virt/kvm/arm/hyp/aarch32.c
index d31f267961e7..25c0e47d57cb 100644
--- a/virt/kvm/arm/hyp/aarch32.c
+++ b/virt/kvm/arm/hyp/aarch32.c
@@ -125,12 +125,16 @@ static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
*/
void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
{
+ u32 pc = *vcpu_pc(vcpu);
bool is_thumb;
is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
if (is_thumb && !is_wide_instr)
- *vcpu_pc(vcpu) += 2;
+ pc += 2;
else
- *vcpu_pc(vcpu) += 4;
+ pc += 4;
+
+ *vcpu_pc(vcpu) = pc;
+
kvm_adjust_itstate(vcpu);
}
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index 14a162e295a9..ae364716ee40 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -186,6 +186,33 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
}
+static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ /*
+ * Zero the input registers' upper 32 bits. They will be fully
+ * zeroed on exit, so we're fine changing them in place.
+ */
+ for (i = 1; i < 4; i++)
+ vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i)));
+}
+
+static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn)
+{
+ switch(fn) {
+ case PSCI_0_2_FN64_CPU_SUSPEND:
+ case PSCI_0_2_FN64_CPU_ON:
+ case PSCI_0_2_FN64_AFFINITY_INFO:
+ /* Disallow these functions for 32bit guests */
+ if (vcpu_mode_is_32bit(vcpu))
+ return PSCI_RET_NOT_SUPPORTED;
+ break;
+ }
+
+ return 0;
+}
+
static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
@@ -193,6 +220,10 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
unsigned long val;
int ret = 1;
+ val = kvm_psci_check_allowed_function(vcpu, psci_fn);
+ if (val)
+ goto out;
+
switch (psci_fn) {
case PSCI_0_2_FN_PSCI_VERSION:
/*
@@ -210,12 +241,16 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
val = PSCI_RET_SUCCESS;
break;
case PSCI_0_2_FN_CPU_ON:
+ kvm_psci_narrow_to_32bit(vcpu);
+ fallthrough;
case PSCI_0_2_FN64_CPU_ON:
mutex_lock(&kvm->lock);
val = kvm_psci_vcpu_on(vcpu);
mutex_unlock(&kvm->lock);
break;
case PSCI_0_2_FN_AFFINITY_INFO:
+ kvm_psci_narrow_to_32bit(vcpu);
+ fallthrough;
case PSCI_0_2_FN64_AFFINITY_INFO:
val = kvm_psci_vcpu_affinity_info(vcpu);
break;
@@ -256,6 +291,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
break;
}
+out:
smccc_set_retval(vcpu, val, 0, 0, 0);
return ret;
}
@@ -273,6 +309,10 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
break;
case PSCI_1_0_FN_PSCI_FEATURES:
feature = smccc_get_arg1(vcpu);
+ val = kvm_psci_check_allowed_function(vcpu, feature);
+ if (val)
+ break;
+
switch(feature) {
case PSCI_0_2_FN_PSCI_VERSION:
case PSCI_0_2_FN_CPU_SUSPEND:
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index a963b9d766b7..32e32d67a127 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -294,8 +294,15 @@ int vgic_init(struct kvm *kvm)
}
}
- if (vgic_has_its(kvm)) {
+ if (vgic_has_its(kvm))
vgic_lpi_translation_cache_init(kvm);
+
+ /*
+ * If we have GICv4.1 enabled, unconditionnaly request enable the
+ * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
+ * enable it if we present a virtual ITS to the guest.
+ */
+ if (vgic_supports_direct_msis(kvm)) {
ret = vgic_v4_init(kvm);
if (ret)
goto out;
@@ -348,6 +355,12 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ /*
+ * Retire all pending LPIs on this vcpu anyway as we're
+ * going to destroy it.
+ */
+ vgic_flush_pending_lpis(vcpu);
+
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
}
@@ -359,10 +372,10 @@ static void __kvm_vgic_destroy(struct kvm *kvm)
vgic_debug_destroy(kvm);
- kvm_vgic_dist_destroy(kvm);
-
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vgic_vcpu_destroy(vcpu);
+
+ kvm_vgic_dist_destroy(kvm);
}
void kvm_vgic_destroy(struct kvm *kvm)
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index d53d34a33e35..c012a52b19f5 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -96,14 +96,21 @@ out_unlock:
* We "cache" the configuration table entries in our struct vgic_irq's.
* However we only have those structs for mapped IRQs, so we read in
* the respective config data from memory here upon mapping the LPI.
+ *
+ * Should any of these fail, behave as if we couldn't create the LPI
+ * by dropping the refcount and returning the error.
*/
ret = update_lpi_config(kvm, irq, NULL, false);
- if (ret)
+ if (ret) {
+ vgic_put_irq(kvm, irq);
return ERR_PTR(ret);
+ }
ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
- if (ret)
+ if (ret) {
+ vgic_put_irq(kvm, irq);
return ERR_PTR(ret);
+ }
return irq;
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index 5945f062d749..a016f07adc28 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -409,24 +409,28 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = {
NULL, vgic_mmio_uaccess_write_v2_group, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
- vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1,
+ vgic_mmio_read_enable, vgic_mmio_write_senable,
+ NULL, vgic_uaccess_write_senable, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
- vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1,
+ vgic_mmio_read_enable, vgic_mmio_write_cenable,
+ NULL, vgic_uaccess_write_cenable, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
- vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1,
+ vgic_mmio_read_pending, vgic_mmio_write_spending,
+ NULL, vgic_uaccess_write_spending, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
- vgic_mmio_read_pending, vgic_mmio_write_cpending, NULL, NULL, 1,
+ vgic_mmio_read_pending, vgic_mmio_write_cpending,
+ NULL, vgic_uaccess_write_cpending, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
vgic_mmio_read_active, vgic_mmio_write_sactive,
- NULL, vgic_mmio_uaccess_write_sactive, 1,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
vgic_mmio_read_active, vgic_mmio_write_cactive,
- NULL, vgic_mmio_uaccess_write_cactive, 1,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index e72dcc454247..89a14ec8b33b 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -50,7 +50,8 @@ bool vgic_has_its(struct kvm *kvm)
bool vgic_supports_direct_msis(struct kvm *kvm)
{
- return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
+ return (kvm_vgic_global_state.has_gicv4_1 ||
+ (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
}
/*
@@ -538,10 +539,12 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
- vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1,
+ vgic_mmio_read_enable, vgic_mmio_write_senable,
+ NULL, vgic_uaccess_write_senable, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
- vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1,
+ vgic_mmio_read_enable, vgic_mmio_write_cenable,
+ NULL, vgic_uaccess_write_cenable, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
vgic_mmio_read_pending, vgic_mmio_write_spending,
@@ -553,11 +556,11 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
vgic_mmio_read_active, vgic_mmio_write_sactive,
- NULL, vgic_mmio_uaccess_write_sactive, 1,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
vgic_mmio_read_active, vgic_mmio_write_cactive,
- NULL, vgic_mmio_uaccess_write_cactive,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
1, VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
@@ -609,11 +612,13 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0,
vgic_mmio_read_group, vgic_mmio_write_group, 4,
VGIC_ACCESS_32bit),
- REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ISENABLER0,
- vgic_mmio_read_enable, vgic_mmio_write_senable, 4,
+ REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0,
+ vgic_mmio_read_enable, vgic_mmio_write_senable,
+ NULL, vgic_uaccess_write_senable, 4,
VGIC_ACCESS_32bit),
- REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICENABLER0,
- vgic_mmio_read_enable, vgic_mmio_write_cenable, 4,
+ REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0,
+ vgic_mmio_read_enable, vgic_mmio_write_cenable,
+ NULL, vgic_uaccess_write_cenable, 4,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
vgic_mmio_read_pending, vgic_mmio_write_spending,
@@ -625,12 +630,12 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
vgic_mmio_read_active, vgic_mmio_write_sactive,
- NULL, vgic_mmio_uaccess_write_sactive,
- 4, VGIC_ACCESS_32bit),
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
+ VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
vgic_mmio_read_active, vgic_mmio_write_cactive,
- NULL, vgic_mmio_uaccess_write_cactive,
- 4, VGIC_ACCESS_32bit),
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
+ VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 2199302597fa..b2d73fc0d1ef 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -184,6 +184,48 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
}
}
+int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->enabled = true;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
+int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->enabled = false;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
@@ -219,17 +261,6 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
return value;
}
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
- bool is_uaccess)
-{
- if (is_uaccess)
- return;
-
- irq->pending_latch = true;
- vgic_irq_set_phys_active(irq, true);
-}
-
static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
return (vgic_irq_is_sgi(irq->intid) &&
@@ -240,7 +271,6 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
{
- bool is_uaccess = !kvm_get_running_vcpu();
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
int i;
unsigned long flags;
@@ -270,22 +300,48 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
continue;
}
+ irq->pending_latch = true;
if (irq->hw)
- vgic_hw_irq_spending(vcpu, irq, is_uaccess);
- else
- irq->pending_latch = true;
+ vgic_irq_set_phys_active(irq, true);
+
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
vgic_put_irq(vcpu->kvm, irq);
}
}
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
- bool is_uaccess)
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
{
- if (is_uaccess)
- return;
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->pending_latch = true;
+ /*
+ * GICv2 SGIs are terribly broken. We can't restore
+ * the source of the interrupt, so just pick the vcpu
+ * itself as the source...
+ */
+ if (is_vgic_v2_sgi(vcpu, irq))
+ irq->source |= BIT(vcpu->vcpu_id);
+
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
+{
irq->pending_latch = false;
/*
@@ -308,7 +364,6 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
{
- bool is_uaccess = !kvm_get_running_vcpu();
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
int i;
unsigned long flags;
@@ -339,7 +394,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
}
if (irq->hw)
- vgic_hw_irq_cpending(vcpu, irq, is_uaccess);
+ vgic_hw_irq_cpending(vcpu, irq);
else
irq->pending_latch = false;
@@ -348,8 +403,68 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
}
}
-unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len)
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ /*
+ * More fun with GICv2 SGIs! If we're clearing one of them
+ * from userspace, which source vcpu to clear? Let's not
+ * even think of it, and blow the whole set.
+ */
+ if (is_vgic_v2_sgi(vcpu, irq))
+ irq->source = 0;
+
+ irq->pending_latch = false;
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
+/*
+ * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
+ * is not queued on some running VCPU's LRs, because then the change to the
+ * active state can be overwritten when the VCPU's state is synced coming back
+ * from the guest.
+ *
+ * For shared interrupts as well as GICv3 private interrupts, we have to
+ * stop all the VCPUs because interrupts can be migrated while we don't hold
+ * the IRQ locks and we don't want to be chasing moving targets.
+ *
+ * For GICv2 private interrupts we don't have to do anything because
+ * userspace accesses to the VGIC state already require all VCPUs to be
+ * stopped, and only the VCPU itself can modify its private interrupts
+ * active state, which guarantees that the VCPU is not running.
+ */
+static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
+{
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid >= VGIC_NR_PRIVATE_IRQS)
+ kvm_arm_halt_guest(vcpu->kvm);
+}
+
+/* See vgic_access_active_prepare */
+static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
+{
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid >= VGIC_NR_PRIVATE_IRQS)
+ kvm_arm_resume_guest(vcpu->kvm);
+}
+
+static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
u32 value = 0;
@@ -359,6 +474,10 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
for (i = 0; i < len * 8; i++) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ /*
+ * Even for HW interrupts, don't evaluate the HW state as
+ * all the guest is interested in is the virtual state.
+ */
if (irq->active)
value |= (1U << i);
@@ -368,6 +487,29 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
return value;
}
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ u32 val;
+
+ mutex_lock(&vcpu->kvm->lock);
+ vgic_access_active_prepare(vcpu, intid);
+
+ val = __vgic_mmio_read_active(vcpu, addr, len);
+
+ vgic_access_active_finish(vcpu, intid);
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return val;
+}
+
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return __vgic_mmio_read_active(vcpu, addr, len);
+}
+
/* Must be called with irq->irq_lock held */
static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
bool active, bool is_uaccess)
@@ -426,36 +568,6 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
}
-/*
- * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
- * is not queued on some running VCPU's LRs, because then the change to the
- * active state can be overwritten when the VCPU's state is synced coming back
- * from the guest.
- *
- * For shared interrupts, we have to stop all the VCPUs because interrupts can
- * be migrated while we don't hold the IRQ locks and we don't want to be
- * chasing moving targets.
- *
- * For private interrupts we don't have to do anything because userspace
- * accesses to the VGIC state already require all VCPUs to be stopped, and
- * only the VCPU itself can modify its private interrupts active state, which
- * guarantees that the VCPU is not running.
- */
-static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
-{
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
- intid > VGIC_NR_PRIVATE_IRQS)
- kvm_arm_halt_guest(vcpu->kvm);
-}
-
-/* See vgic_change_active_prepare */
-static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
-{
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
- intid > VGIC_NR_PRIVATE_IRQS)
- kvm_arm_resume_guest(vcpu->kvm);
-}
-
static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
@@ -477,11 +589,11 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
mutex_lock(&vcpu->kvm->lock);
- vgic_change_active_prepare(vcpu, intid);
+ vgic_access_active_prepare(vcpu, intid);
__vgic_mmio_write_cactive(vcpu, addr, len, val);
- vgic_change_active_finish(vcpu, intid);
+ vgic_access_active_finish(vcpu, intid);
mutex_unlock(&vcpu->kvm->lock);
}
@@ -514,11 +626,11 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
mutex_lock(&vcpu->kvm->lock);
- vgic_change_active_prepare(vcpu, intid);
+ vgic_access_active_prepare(vcpu, intid);
__vgic_mmio_write_sactive(vcpu, addr, len, val);
- vgic_change_active_finish(vcpu, intid);
+ vgic_access_active_finish(vcpu, intid);
mutex_unlock(&vcpu->kvm->lock);
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 5af2aefad435..fefcca2b14dc 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -138,6 +138,14 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val);
+int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len);
@@ -149,9 +157,20 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val);
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len);
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val);