aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/locking.txt31
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c16
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c138
-rw-r--r--arch/powerpc/kvm/book3s_pr.c130
-rw-r--r--arch/powerpc/kvm/book3s_xics.c192
-rw-r--r--arch/powerpc/kvm/book3s_xics.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/kvm_emulate.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h25
-rw-r--r--arch/x86/include/asm/vmx.h28
-rw-r--r--arch/x86/kernel/fpu/xstate.c1
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/emulate.c20
-rw-r--r--arch/x86/kvm/hyperv.c4
-rw-r--r--arch/x86/kvm/i8259.c16
-rw-r--r--arch/x86/kvm/irq.h19
-rw-r--r--arch/x86/kvm/irq_comm.c29
-rw-r--r--arch/x86/kvm/lapic.c135
-rw-r--r--arch/x86/kvm/lapic.h12
-rw-r--r--arch/x86/kvm/mmu.c489
-rw-r--r--arch/x86/kvm/svm.c2
-rw-r--r--arch/x86/kvm/vmx.c62
-rw-r--r--arch/x86/kvm/x86.c92
-rw-r--r--tools/arch/x86/include/asm/cpufeatures.h1
25 files changed, 951 insertions, 505 deletions
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index fd013bf4115b..1bb8bcaf8497 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -26,9 +26,16 @@ sections.
Fast page fault:
Fast page fault is the fast path which fixes the guest page fault out of
-the mmu-lock on x86. Currently, the page fault can be fast only if the
-shadow page table is present and it is caused by write-protect, that means
-we just need change the W bit of the spte.
+the mmu-lock on x86. Currently, the page fault can be fast in one of the
+following two cases:
+
+1. Access Tracking: The SPTE is not present, but it is marked for access
+tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
+restore the saved R/X bits. This is described in more detail later below.
+
+2. Write-Protection: The SPTE is present and the fault is
+caused by write-protect. That means we just need to change the W bit of the
+spte.
What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
SPTE_MMU_WRITEABLE bit on the spte:
@@ -38,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte:
page write-protection.
On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this
+bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
+restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
is safe because whenever changing these bits can be detected by cmpxchg.
But we need carefully check these cases:
@@ -142,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always
atomically update the spte, the race caused by fast page fault can be avoided,
See the comments in spte_has_volatile_bits() and mmu_spte_update().
+Lockless Access Tracking:
+
+This is used for Intel CPUs that are using EPT but do not support the EPT A/D
+bits. In this case, when the KVM MMU notifier is called to track accesses to a
+page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
+by clearing the RWX bits in the PTE and storing the original R & X bits in
+some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
+PTE (using the ignored bit 62). When the VM tries to access the page later on,
+a fault is generated and the fast page fault mechanism described above is used
+to atomically restore the PTE to a Present state. The W bit is not saved when
+the PTE is marked for access tracking and during restoration to the Present
+state, the W bit is set depending on whether or not it was a write access. If
+it wasn't, then the W bit will remain clear until a write access happens, at
+which time it will be set using the Dirty tracking mechanism described above.
+
3. Reference
------------
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index cc0908b6c2a0..4edbe4bb0e8b 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -633,5 +633,7 @@ struct kvm_ppc_rmmu_info {
#define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40)
#define KVM_XICS_MASKED (1ULL << 41)
#define KVM_XICS_PENDING (1ULL << 42)
+#define KVM_XICS_PRESENTED (1ULL << 43)
+#define KVM_XICS_QUEUED (1ULL << 44)
#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e4a79679342e..bdf281cc88c0 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
++vcpu->stat.halt_wakeup;
}
- if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
+ cpu = READ_ONCE(vcpu->arch.thread_cpu);
+ if (cpu >= 0 && kvmppc_ipi_thread(cpu))
return;
/* CPU points to the first thread of the core */
@@ -773,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
}
tvcpu->arch.prodded = 1;
smp_mb();
- if (vcpu->arch.ceded) {
- if (swait_active(&vcpu->wq)) {
- swake_up(&vcpu->wq);
- vcpu->stat.halt_wakeup++;
- }
- }
+ if (tvcpu->arch.ceded)
+ kvmppc_fast_vcpu_kick_hv(tvcpu);
break;
case H_CONFER:
target = kvmppc_get_gpr(vcpu, 4);
@@ -2665,7 +2662,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
int i;
for_each_runnable_thread(i, vcpu, vc) {
- if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+ if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
+ vcpu->arch.prodded)
return 1;
}
@@ -2851,7 +2849,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
break;
n_ceded = 0;
for_each_runnable_thread(i, v, vc) {
- if (!v->arch.pending_exceptions)
+ if (!v->arch.pending_exceptions && !v->arch.prodded)
n_ceded += v->arch.ceded;
else
v->arch.ceded = 0;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 7e2eb3e865b3..0b2e388f4cdf 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -35,7 +35,7 @@ int kvm_irq_bypass = 1;
EXPORT_SYMBOL(kvm_irq_bypass);
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq);
+ u32 new_irq, bool check_resend);
static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
/* -- ICS routines -- */
@@ -44,20 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
{
int i;
- arch_spin_lock(&ics->lock);
-
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *state = &ics->irq_state[i];
-
- if (!state->resend)
- continue;
-
- arch_spin_unlock(&ics->lock);
- icp_rm_deliver_irq(xics, icp, state->number);
- arch_spin_lock(&ics->lock);
+ if (state->resend)
+ icp_rm_deliver_irq(xics, icp, state->number, true);
}
- arch_spin_unlock(&ics->lock);
}
/* -- ICP routines -- */
@@ -288,7 +280,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
}
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq)
+ u32 new_irq, bool check_resend)
{
struct ics_irq_state *state;
struct kvmppc_ics *ics;
@@ -333,6 +325,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
}
}
+ if (check_resend)
+ if (!state->resend)
+ goto out;
+
/* Clear the resend bit of that interrupt */
state->resend = 0;
@@ -378,7 +374,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
if (reject && reject != XICS_IPI) {
arch_spin_unlock(&ics->lock);
+ icp->n_reject++;
new_irq = reject;
+ check_resend = 0;
goto again;
}
} else {
@@ -386,10 +384,16 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
* We failed to deliver the interrupt we need to set the
* resend map bit and mark the ICS state as needing a resend
*/
- set_bit(ics->icsid, icp->resend_map);
state->resend = 1;
/*
+ * Make sure when checking resend, we don't miss the resend
+ * if resend_map bit is seen and cleared.
+ */
+ smp_wmb();
+ set_bit(ics->icsid, icp->resend_map);
+
+ /*
* If the need_resend flag got cleared in the ICP some time
* between icp_rm_try_to_deliver() atomic update and now, then
* we know it might have missed the resend_map bit. So we
@@ -397,7 +401,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
smp_mb();
if (!icp->state.need_resend) {
+ state->resend = 0;
arch_spin_unlock(&ics->lock);
+ check_resend = 0;
goto again;
}
}
@@ -592,7 +598,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
/* Handle reject in real mode */
if (reject && reject != XICS_IPI) {
this_icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, reject);
+ icp_rm_deliver_irq(xics, icp, reject, false);
}
/* Handle resends in real mode */
@@ -660,59 +666,45 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
*/
if (reject && reject != XICS_IPI) {
icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, reject);
+ icp_rm_deliver_irq(xics, icp, reject, false);
}
bail:
return check_too_hard(xics, icp);
}
-int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
{
struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
struct kvmppc_icp *icp = vcpu->arch.icp;
struct kvmppc_ics *ics;
struct ics_irq_state *state;
- u32 irq = xirr & 0x00ffffff;
u16 src;
-
- if (!xics || !xics->real_mode)
- return H_TOO_HARD;
+ u32 pq_old, pq_new;
/*
- * ICP State: EOI
+ * ICS EOI handling: For LSI, if P bit is still set, we need to
+ * resend it.
*
- * Note: If EOI is incorrectly used by SW to lower the CPPR
- * value (ie more favored), we do not check for rejection of
- * a pending interrupt, this is a SW error and PAPR sepcifies
- * that we don't have to deal with it.
- *
- * The sending of an EOI to the ICS is handled after the
- * CPPR update
- *
- * ICP State: Down_CPPR which we handle
- * in a separate function as it's shared with H_CPPR.
+ * For MSI, we move Q bit into P (and clear Q). If it is set,
+ * resend it.
*/
- icp_rm_down_cppr(xics, icp, xirr >> 24);
- /* IPIs have no EOI */
- if (irq == XICS_IPI)
- goto bail;
- /*
- * EOI handling: If the interrupt is still asserted, we need to
- * resend it. We can take a lockless "peek" at the ICS state here.
- *
- * "Message" interrupts will never have "asserted" set
- */
ics = kvmppc_xics_find_ics(xics, irq, &src);
if (!ics)
goto bail;
+
state = &ics->irq_state[src];
- /* Still asserted, resend it */
- if (state->asserted) {
- icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, irq);
- }
+ if (state->lsi)
+ pq_new = state->pq_state;
+ else
+ do {
+ pq_old = state->pq_state;
+ pq_new = pq_old >> 1;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ if (pq_new & PQ_PRESENTED)
+ icp_rm_deliver_irq(xics, NULL, irq, false);
if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
icp->rm_action |= XICS_RM_NOTIFY_EOI;
@@ -733,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
state->intr_cpu = -1;
}
}
+
bail:
return check_too_hard(xics, icp);
}
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 irq = xirr & 0x00ffffff;
+
+ if (!xics || !xics->real_mode)
+ return H_TOO_HARD;
+
+ /*
+ * ICP State: EOI
+ *
+ * Note: If EOI is incorrectly used by SW to lower the CPPR
+ * value (ie more favored), we do not check for rejection of
+ * a pending interrupt, this is a SW error and PAPR specifies
+ * that we don't have to deal with it.
+ *
+ * The sending of an EOI to the ICS is handled after the
+ * CPPR update
+ *
+ * ICP State: Down_CPPR which we handle
+ * in a separate function as it's shared with H_CPPR.
+ */
+ icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+ /* IPIs have no EOI */
+ if (irq == XICS_IPI)
+ return check_too_hard(xics, icp);
+
+ return ics_rm_eoi(vcpu, irq);
+}
+
unsigned long eoi_rc;
static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
@@ -823,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
{
struct kvmppc_xics *xics;
struct kvmppc_icp *icp;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
u32 irq;
+ u16 src;
+ u32 pq_old, pq_new;
irq = irq_map->v_hwirq;
xics = vcpu->kvm->arch.xics;
icp = vcpu->arch.icp;
kvmppc_rm_handle_irq_desc(irq_map->desc);
- icp_rm_deliver_irq(xics, icp, irq);
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics)
+ return 2;
+
+ state = &ics->irq_state[src];
+
+ /* only MSIs register bypass producers, so it must be MSI here */
+ do {
+ pq_old = state->pq_state;
+ pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ /* Test P=1, Q=0, this is the only case where we present */
+ if (pq_new == PQ_PRESENTED)
+ icp_rm_deliver_irq(xics, icp, irq, false);
/* EOI the interrupt */
icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 1482961ceb4d..d4dfc0ca2a44 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu)
}
}
+static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int exit_nr)
+{
+ enum emulation_result er;
+ ulong flags;
+ u32 last_inst;
+ int emul, r;
+
+ /*
+ * shadow_srr1 only contains valid flags if we came here via a program
+ * exception. The other exceptions (emulation assist, FP unavailable,
+ * etc.) do not provide flags in SRR1, so use an illegal-instruction
+ * exception when injecting a program interrupt into the guest.
+ */
+ if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+ flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+ else
+ flags = SRR1_PROGILL;
+
+ emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+ if (emul != EMULATE_DONE)
+ return RESUME_GUEST;
+
+ if (kvmppc_get_msr(vcpu) & MSR_PR) {
+#ifdef EXIT_DEBUG
+ pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
+ kvmppc_get_pc(vcpu), last_inst);
+#endif
+ if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) {
+ kvmppc_core_queue_program(vcpu, flags);
+ return RESUME_GUEST;
+ }
+ }
+
+ vcpu->stat.emulated_inst_exits++;
+ er = kvmppc_emulate_instruction(run, vcpu);
+ switch (er) {
+ case EMULATE_DONE:
+ r = RESUME_GUEST_NV;
+ break;
+ case EMULATE_AGAIN:
+ r = RESUME_GUEST;
+ break;
+ case EMULATE_FAIL:
+ pr_crit("%s: emulation at %lx failed (%08x)\n",
+ __func__, kvmppc_get_pc(vcpu), last_inst);
+ kvmppc_core_queue_program(vcpu, flags);
+ r = RESUME_GUEST;
+ break;
+ case EMULATE_DO_MMIO:
+ run->exit_reason = KVM_EXIT_MMIO;
+ r = RESUME_HOST_NV;
+ break;
+ case EMULATE_EXIT_USER:
+ r = RESUME_HOST_NV;
+ break;
+ default:
+ BUG();
+ }
+
+ return r;
+}
+
int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int exit_nr)
{
@@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
case BOOK3S_INTERRUPT_PROGRAM:
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
- {
- enum emulation_result er;
- ulong flags;
- u32 last_inst;
- int emul;
-
-program_interrupt:
- /*
- * shadow_srr1 only contains valid flags if we came here via
- * a program exception. The other exceptions (emulation assist,
- * FP unavailable, etc.) do not provide flags in SRR1, so use
- * an illegal-instruction exception when injecting a program
- * interrupt into the guest.
- */
- if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
- flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
- else
- flags = SRR1_PROGILL;
-
- emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
- if (emul != EMULATE_DONE) {
- r = RESUME_GUEST;
- break;
- }
-
- if (kvmppc_get_msr(vcpu) & MSR_PR) {
-#ifdef EXIT_DEBUG
- pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
- kvmppc_get_pc(vcpu), last_inst);
-#endif
- if ((last_inst & 0xff0007ff) !=
- (INS_DCBZ & 0xfffffff7)) {
- kvmppc_core_queue_program(vcpu, flags);
- r = RESUME_GUEST;
- break;
- }
- }
-
- vcpu->stat.emulated_inst_exits++;
- er = kvmppc_emulate_instruction(run, vcpu);
- switch (er) {
- case EMULATE_DONE:
- r = RESUME_GUEST_NV;
- break;
- case EMULATE_AGAIN:
- r = RESUME_GUEST;
- break;
- case EMULATE_FAIL:
- printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
- __func__, kvmppc_get_pc(vcpu), last_inst);
- kvmppc_core_queue_program(vcpu, flags);
- r = RESUME_GUEST;
- break;
- case EMULATE_DO_MMIO:
- run->exit_reason = KVM_EXIT_MMIO;
- r = RESUME_HOST_NV;
- break;
- case EMULATE_EXIT_USER:
- r = RESUME_HOST_NV;
- break;
- default:
- BUG();
- }
+ r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
break;
- }
case BOOK3S_INTERRUPT_SYSCALL:
{
u32 last_sc;
@@ -1185,7 +1185,7 @@ program_interrupt:
emul = kvmppc_get_last_inst(vcpu, INST_GENERIC,
&last_inst);
if (emul == EMULATE_DONE)
- goto program_interrupt;
+ r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
else
r = RESUME_GUEST;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 20dff102a06f..e48803e2918d 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -63,7 +63,7 @@
/* -- ICS routines -- */
static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq);
+ u32 new_irq, bool check_resend);
/*
* Return value ideally indicates how the interrupt was handled, but no
@@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
struct ics_irq_state *state;
struct kvmppc_ics *ics;
u16 src;
+ u32 pq_old, pq_new;
XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
@@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
if (!state->exists)
return -EINVAL;
+ if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET)
+ level = 1;
+ else if (level == KVM_INTERRUPT_UNSET)
+ level = 0;
/*
- * We set state->asserted locklessly. This should be fine as
- * we are the only setter, thus concurrent access is undefined
- * to begin with.
+ * Take other values the same as 1, consistent with original code.
+ * maybe WARN here?
*/
- if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
- state->asserted = 1;
- else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
- state->asserted = 0;
+
+ if (!state->lsi && level == 0) /* noop for MSI */
return 0;
- }
+
+ do {
+ pq_old = state->pq_state;
+ if (state->lsi) {
+ if (level) {
+ if (pq_old & PQ_PRESENTED)
+ /* Setting already set LSI ... */
+ return 0;
+
+ pq_new = PQ_PRESENTED;
+ } else
+ pq_new = 0;
+ } else
+ pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ /* Test P=1, Q=0, this is the only case where we present */
+ if (pq_new == PQ_PRESENTED)
+ icp_deliver_irq(xics, NULL, irq, false);
/* Record which CPU this arrived on for passed-through interrupts */
if (state->host_irq)
state->intr_cpu = raw_smp_processor_id();
- /* Attempt delivery */
- icp_deliver_irq(xics, NULL, irq);
-
return 0;
}
@@ -114,29 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
{
int i;
- unsigned long flags;
-
- local_irq_save(flags);
- arch_spin_lock(&ics->lock);
-
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *state = &ics->irq_state[i];
-
- if (!state->resend)
- continue;
-
- XICS_DBG("resend %#x prio %#x\n", state->number,
- state->priority);
-
- arch_spin_unlock(&ics->lock);
- local_irq_restore(flags);
- icp_deliver_irq(xics, icp, state->number);
- local_irq_save(flags);
- arch_spin_lock(&ics->lock);
+ if (state->resend) {
+ XICS_DBG("resend %#x prio %#x\n", state->number,
+ state->priority);
+ icp_deliver_irq(xics, icp, state->number, true);
+ }
}
-
- arch_spin_unlock(&ics->lock);
- local_irq_restore(flags);
}
static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
deliver = false;
if ((state->masked_pending || state->resend) && priority != MASKED) {
state->masked_pending = 0;
+ state->resend = 0;
deliver = true;
}
@@ -189,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
state->masked_pending, state->resend);
if (write_xive(xics, ics, state, server, priority, priority))
- icp_deliver_irq(xics, icp, irq);
+ icp_deliver_irq(xics, icp, irq, false);
return 0;
}
@@ -242,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
if (write_xive(xics, ics, state, state->server, state->saved_priority,
state->saved_priority))
- icp_deliver_irq(xics, icp, irq);
+ icp_deliver_irq(xics, icp, irq, false);
return 0;
}
@@ -376,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
}
static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq)
+ u32 new_irq, bool check_resend)
{
struct ics_irq_state *state;
struct kvmppc_ics *ics;
@@ -422,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
}
}
+ if (check_resend)
+ if (!state->resend)
+ goto out;
+
/* Clear the resend bit of that interrupt */
state->resend = 0;
@@ -470,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
new_irq = reject;
+ check_resend = 0;
goto again;
}
} else {
@@ -477,10 +485,16 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
* We failed to deliver the interrupt we need to set the
* resend map bit and mark the ICS state as needing a resend
*/
- set_bit(ics->icsid, icp->resend_map);
state->resend = 1;
/*
+ * Make sure when checking resend, we don't miss the resend
+ * if resend_map bit is seen and cleared.
+ */
+ smp_wmb();
+ set_bit(ics->icsid, icp->resend_map);
+
+ /*
* If the need_resend flag got cleared in the ICP some time
* between icp_try_to_deliver() atomic update and now, then
* we know it might have missed the resend_map bit. So we
@@ -488,8 +502,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
smp_mb();
if (!icp->state.need_resend) {
+ state->resend = 0;
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
+ check_resend = 0;
goto again;
}
}
@@ -681,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
/* Handle reject */
if (reject && reject != XICS_IPI)
- icp_deliver_irq(xics, icp, reject);
+ icp_deliver_irq(xics, icp, reject, false);
/* Handle resend */
if (resend)
@@ -761,17 +777,54 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
* attempt (see comments in icp_deliver_irq).
*/
if (reject && reject != XICS_IPI)
- icp_deliver_irq(xics, icp, reject);
+ icp_deliver_irq(xics, icp, reject, false);
}
-static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq)
{
struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
struct kvmppc_icp *icp = vcpu->arch.icp;
struct kvmppc_ics *ics;
struct ics_irq_state *state;
- u32 irq = xirr & 0x00ffffff;
u16 src;
+ u32 pq_old, pq_new;
+
+ /*
+ * ICS EOI handling: For LSI, if P bit is still set, we need to
+ * resend it.
+ *
+ * For MSI, we move Q bit into P (and clear Q). If it is set,
+ * resend it.
+ */
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics) {
+ XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq);
+ return H_PARAMETER;
+ }
+ state = &ics->irq_state[src];
+
+ if (state->lsi)
+ pq_new = state->pq_state;
+ else
+ do {
+ pq_old = state->pq_state;
+ pq_new = pq_old >> 1;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ if (pq_new & PQ_PRESENTED)
+ icp_deliver_irq(xics, icp, irq, false);
+
+ kvm_notify_acked_irq(vcpu->kvm, 0, irq);
+
+ return H_SUCCESS;
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 irq = xirr & 0x00ffffff;
XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
@@ -794,26 +847,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
/* IPIs have no EOI */
if (irq == XICS_IPI)
return H_SUCCESS;
- /*
- * EOI handling: If the interrupt is still asserted, we need to
- * resend it. We can take a lockless "peek" at the ICS state here.
- *
- * "Message" interrupts will never have "asserted" set
- */
- ics = kvmppc_xics_find_ics(xics, irq, &src);
- if (!ics) {
- XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
- return H_PARAMETER;
- }
- state = &ics->irq_state[src];
- /* Still asserted, resend it */
- if (state->asserted)
- icp_deliver_irq(xics, icp, irq);
-
- kvm_notify_acked_irq(vcpu->kvm, 0, irq);
-
- return H_SUCCESS;
+ return ics_eoi(vcpu, irq);
}
int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
@@ -832,10 +867,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
icp->n_rm_check_resend++;
icp_check_resend(xics, icp->rm_resend_icp);
}
- if (icp->rm_action & XICS_RM_REJECT) {
- icp->n_rm_reject++;
- icp_deliver_irq(xics, icp, icp->rm_reject);
- }
if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
icp->n_rm_notify_eoi++;
kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
@@ -920,7 +951,7 @@ static int xics_debug_show(struct seq_file *m, void *private)
int icsid, i;
unsigned long flags;
unsigned long t_rm_kick_vcpu, t_rm_check_resend;
- unsigned long t_rm_reject, t_rm_notify_eoi;
+ unsigned long t_rm_notify_eoi;
unsigned long t_reject, t_check_resend;
if (!kvm)
@@ -929,7 +960,6 @@ static int xics_debug_show(struct seq_file *m, void *private)
t_rm_kick_vcpu = 0;
t_rm_notify_eoi = 0;
t_rm_check_resend = 0;
- t_rm_reject = 0;
t_check_resend = 0;
t_reject = 0;
@@ -952,14 +982,13 @@ static int xics_debug_show(struct seq_file *m, void *private)
t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
t_rm_notify_eoi += icp->n_rm_notify_eoi;
t_rm_check_resend += icp->n_rm_check_resend;
- t_rm_reject += icp->n_rm_reject;
t_check_resend += icp->n_check_resend;
t_reject += icp->n_reject;
}
- seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+ seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n",
t_rm_kick_vcpu, t_rm_check_resend,
- t_rm_reject, t_rm_notify_eoi);
+ t_rm_notify_eoi);
seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
t_check_resend, t_reject);
for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
@@ -977,9 +1006,9 @@ static int xics_debug_show(struct seq_file *m, void *private)
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *irq = &ics->irq_state[i];
- seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+ seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n",
irq->number, irq->server, irq->priority,
- irq->saved_priority, irq->asserted,
+ irq->saved_priority, irq->pq_state,
irq->resend, irq->masked_pending);
}
@@ -1198,10 +1227,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
val |= prio << KVM_XICS_PRIORITY_SHIFT;
if (irqp->lsi) {
val |= KVM_XICS_LEVEL_SENSITIVE;
- if (irqp->asserted)
+ if (irqp->pq_state & PQ_PRESENTED)
val |= KVM_XICS_PENDING;
} else if (irqp->masked_pending || irqp->resend)
val |= KVM_XICS_PENDING;
+
+ if (irqp->pq_state & PQ_PRESENTED)
+ val |= KVM_XICS_PRESENTED;
+
+ if (irqp->pq_state & PQ_QUEUED)
+ val |= KVM_XICS_QUEUED;
+
ret = 0;
}
arch_spin_unlock(&ics->lock);
@@ -1253,18 +1289,20 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
irqp->resend = 0;
irqp->masked_pending = 0;
irqp->lsi = 0;
- irqp->asserted = 0;
- if (val & KVM_XICS_LEVEL_SENSITIVE) {
+ irqp->pq_state = 0;
+ if (val & KVM_XICS_LEVEL_SENSITIVE)
irqp->lsi = 1;
- if (val & KVM_XICS_PENDING)
- irqp->asserted = 1;
- }
+ /* If PENDING, set P in case P is not saved because of old code */
+ if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+ irqp->pq_state |= PQ_PRESENTED;
+ if (val & KVM_XICS_QUEUED)
+ irqp->pq_state |= PQ_QUEUED;
irqp->exists = 1;
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
if (val & KVM_XICS_PENDING)
- icp_deliver_irq(xics, NULL, irqp->number);
+ icp_deliver_irq(xics, NULL, irqp->number, false);
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 2a50320b55ca..ec5474cf70c6 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -31,16 +31,19 @@
/* Priority value to use for disabling an interrupt */
#define MASKED 0xff
+#define PQ_PRESENTED 1
+#define PQ_QUEUED 2
+
/* State for one irq source */
struct ics_irq_state {
u32 number;
u32 server;
+ u32 pq_state;
u8 priority;
u8 saved_priority;
u8 resend;
u8 masked_pending;
u8 lsi; /* level-sensitive interrupt */
- u8 asserted; /* Only for LSI */
u8 exists;
int intr_cpu;
u32 host_irq;
@@ -73,7 +76,6 @@ struct kvmppc_icp {
*/
#define XICS_RM_KICK_VCPU 0x1
#define XICS_RM_CHECK_RESEND 0x2
-#define XICS_RM_REJECT 0x4
#define XICS_RM_NOTIFY_EOI 0x8
u32 rm_action;
struct kvm_vcpu *rm_kick_target;
@@ -84,7 +86,6 @@ struct kvmppc_icp {
/* Counters for each reason we exited real mode */
unsigned long n_rm_kick_vcpu;
unsigned long n_rm_check_resend;
- unsigned long n_rm_reject;
unsigned long n_rm_notify_eoi;
/* Counters for handling ICP processing in real mode */
unsigned long n_check_resend;
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index eafee3161d1c..d9d7136edf05 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -288,6 +288,7 @@
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */
/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
@@ -320,5 +321,4 @@
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
-
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e9cd7befcb76..3e8c287090e4 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a7066dc1a7e9..417502cf42b6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -115,7 +115,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64
-#define KVM_MMU_HASH_SHIFT 10
+#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
@@ -208,6 +208,13 @@ enum {
PFERR_WRITE_MASK | \
PFERR_PRESENT_MASK)
+/*
+ * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
+ * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting
+ * with the SVE bit in EPT PTEs.
+ */
+#define SPTE_SPECIAL_MASK (1ULL << 62)
+
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
/*
@@ -668,6 +675,9 @@ struct kvm_vcpu_arch {
int pending_ioapic_eoi;
int pending_external_vector;
+
+ /* GPA available (AMD only) */
+ bool gpa_available;
};
struct kvm_lpage_info {
@@ -716,6 +726,12 @@ struct kvm_hv {
HV_REFERENCE_TSC_PAGE tsc_ref;
};
+enum kvm_irqchip_mode {
+ KVM_IRQCHIP_NONE,
+ KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
+ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
+};
+
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -788,7 +804,7 @@ struct kvm_arch {
u64 disabled_quirks;
- bool irqchip_split;
+ enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
bool disabled_lapic_found;
@@ -815,6 +831,7 @@ struct kvm_vm_stat {
ulong mmu_unsync;
ulong remote_tlb_flush;
ulong lpages;
+ ulong max_mmu_page_hash_collisions;
};
struct kvm_vcpu_stat {
@@ -844,6 +861,7 @@ struct kvm_vcpu_stat {
u64 hypercalls;
u64 irq_injections;
u64 nmi_injections;
+ u64 req_event;
};
struct x86_instruction_info;
@@ -1050,7 +1068,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+ u64 acc_track_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b5b2d4b924e..a22a4790f1ac 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -467,8 +467,16 @@ enum vmcs_field {
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
#define VMX_EPT_IPAT_BIT (1ull << 6)
-#define VMX_EPT_ACCESS_BIT (1ull << 8)
-#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_ACCESS_BIT (1ull << 8)
+#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
+ VMX_EPT_WRITABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT)
+
+/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
+#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
@@ -500,6 +508,22 @@ struct vmx_msr_entry {
#define ENTRY_FAIL_VMCS_LINK_PTR 4
/*
+ * Exit Qualifications for EPT Violations
+ */
+#define EPT_VIOLATION_READ_BIT 0
+#define EPT_VIOLATION_WRITE_BIT 1
+#define EPT_VIOLATION_INSTR_BIT 2
+#define EPT_VIOLATION_READABLE_BIT 3
+#define EPT_VIOLATION_WRITABLE_BIT 4
+#define EPT_VIOLATION_EXECUTABLE_BIT 5
+#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT)
+#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT)
+#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT)
+#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT)
+#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT)
+#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+
+/*
* VM-instruction error numbers
*/
enum vm_instruction_error_number {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 1d7770447b3e..35f7024aace5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -78,6 +78,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_PKU);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
}
/*
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e85f6bd7b9d5..09c2ac741567 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -383,7 +383,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/;
+ F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cedbba0f3402..45c7306c8780 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -173,6 +173,7 @@
#define NearBranch ((u64)1 << 52) /* Near branches */
#define No16 ((u64)1 << 53) /* No 16 bit operand */
#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
+#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -4298,7 +4299,7 @@ static const struct opcode group1[] = {
};
static const struct opcode group1A[] = {
- I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
+ I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
};
static const struct opcode group2[] = {
@@ -4336,7 +4337,7 @@ static const struct opcode group5[] = {
I(SrcMemFAddr | ImplicitOps, em_call_far),
I(SrcMem | NearBranch, em_jmp_abs),
I(SrcMemFAddr | ImplicitOps, em_jmp_far),
- I(SrcMem | Stack, em_push), D(Undefined),
+ I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
};
static const struct opcode group6[] = {
@@ -4556,8 +4557,8 @@ static const struct opcode opcode_table[256] = {
/* 0xA0 - 0xA7 */
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
- I2bv(SrcSI | DstDI | Mov | String, em_mov),
- F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r),
+ I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
+ F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
/* 0xA8 - 0xAF */
F2bv(DstAcc | SrcImm | NoWrite, em_test),
I2bv(SrcAcc | DstDI | Mov | String, em_mov),
@@ -5671,3 +5672,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
{
writeback_registers(ctxt);
}
+
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ return false;
+
+ if (ctxt->d & TwoMemOp)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 1572c35b4f1a..08b27e0c7b71 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return -ENOENT;
memset(&irq, 0, sizeof(irq));
- irq.dest_id = kvm_apic_id(vcpu->arch.apic);
+ irq.shorthand = APIC_DEST_SELF;
irq.dest_mode = APIC_DEST_PHYSICAL;
irq.delivery_mode = APIC_DM_FIXED;
irq.vector = vector;
irq.level = 1;
- ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL);
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
return ret;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 7cc2360f1848..73ea24d4f119 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = {
.write = picdev_eclr_write,
};
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+int kvm_pic_init(struct kvm *kvm)
{
struct kvm_pic *s;
int ret;
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
- return NULL;
+ return -ENOMEM;
spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
@@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
- return s;
+ kvm->arch.vpic = s;
+
+ return 0;
fail_unreg_1:
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
@@ -648,13 +650,17 @@ fail_unlock:
kfree(s);
- return NULL;
+ return ret;
}
-void kvm_destroy_pic(struct kvm_pic *vpic)
+void kvm_pic_destroy(struct kvm *kvm)
{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+
+ kvm->arch.vpic = NULL;
kfree(vpic);
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 035731eb3897..40d5b2cf6061 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -73,8 +73,8 @@ struct kvm_pic {
unsigned long irq_states[PIC_NUM_PINS];
};
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_destroy_pic(struct kvm_pic *vpic);
+int kvm_pic_init(struct kvm *kvm);
+void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
@@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm)
static inline int irqchip_split(struct kvm *kvm)
{
- return kvm->arch.irqchip_split;
+ return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT;
}
-static inline int irqchip_in_kernel(struct kvm *kvm)
+static inline int irqchip_kernel(struct kvm *kvm)
{
- struct kvm_pic *vpic = pic_irqchip(kvm);
- bool ret;
+ return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL;
+}
- ret = (vpic != NULL);
- ret |= irqchip_split(kvm);
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE;
- /* Read vpic before kvm->irq_routing. */
+ /* Matches with wmb after initializing kvm->irq_routing. */
smp_rmb();
return ret;
}
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6c0191615f23..b96d3893f121 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
bool line_status)
{
struct kvm_pic *pic = pic_irqchip(kvm);
-
- /*
- * XXX: rejecting pic routes when pic isn't in use would be better,
- * but the default routing table is installed while kvm->arch.vpic is
- * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE.
- */
- if (!pic)
- return -1;
-
return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
}
@@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
bool line_status)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-
- if (!ioapic)
- return -1;
-
return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
line_status);
}
@@ -297,16 +284,20 @@ int kvm_set_routing_entry(struct kvm *kvm,
case KVM_IRQ_ROUTING_IRQCHIP:
delta = 0;
switch (ue->u.irqchip.irqchip) {
- case KVM_IRQCHIP_PIC_MASTER:
- e->set = kvm_set_pic_irq;
- max_pin = PIC_NUM_PINS;
- break;
case KVM_IRQCHIP_PIC_SLAVE:
+ delta = 8;
+ /* fall through */
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (!pic_in_kernel(kvm))
+ goto out;
+
e->set = kvm_set_pic_irq;
max_pin = PIC_NUM_PINS;
- delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
+ if (!ioapic_in_kernel(kvm))
+ goto out;
+
max_pin = KVM_IOAPIC_NUM_PINS;
e->set = kvm_set_ioapic_irq;
break;
@@ -409,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm)
void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
- if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+ if (!irqchip_split(kvm))
return;
kvm_make_scan_ioapic_request(kvm);
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2f6ef5121a4c..33b799fd3a6e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
+{
+ return apic->vcpu->vcpu_id;
+}
+
static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
switch (map->mode) {
@@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm)
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
int i;
- u32 max_id = 255;
+ u32 max_id = 255; /* enough space for any xAPIC ID */
mutex_lock(&kvm->arch.apic_map_lock);
kvm_for_each_vcpu(i, vcpu, kvm)
if (kvm_apic_present(vcpu))
- max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+ max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
@@ -179,16 +189,28 @@ static void recalculate_apic_map(struct kvm *kvm)
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_lapic **cluster;
u16 mask;
- u32 ldr, aid;
+ u32 ldr;
+ u8 xapic_id;
+ u32 x2apic_id;
if (!kvm_apic_present(vcpu))
continue;
- aid = kvm_apic_id(apic);
- ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ xapic_id = kvm_xapic_id(apic);
+ x2apic_id = kvm_x2apic_id(apic);
- if (aid <= new->max_apic_id)
- new->phys_map[aid] = apic;
+ /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+ /*
+ * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
+ * prevent them from masking VCPUs with APIC ID <= 0xff.
+ */
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
if (apic_x2apic_mode(apic)) {
new->mode |= KVM_APIC_MODE_X2APIC;
@@ -250,6 +272,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+ WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
recalculate_apic_map(apic->vcpu->kvm);
@@ -546,7 +570,15 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-static void apic_update_ppr(struct kvm_lapic *apic)
+static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
+{
+ int highest_irr = apic_find_highest_irr(apic);
+ if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
+ return -1;
+ return highest_irr;
+}
+
+static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
{
u32 tpr, isrv, ppr, old_ppr;
int isr;
@@ -564,12 +596,27 @@ static void apic_update_ppr(struct kvm_lapic *apic)
apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
apic, ppr, isr, isrv);
- if (old_ppr != ppr) {
+ *new_ppr = ppr;
+ if (old_ppr != ppr)
kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
- if (ppr < old_ppr)
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
- }
+
+ return ppr < old_ppr;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+ u32 ppr;
+
+ if (__apic_update_ppr(apic, &ppr) &&
+ apic_has_interrupt_for_ppr(apic, ppr) != -1)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
+{
+ apic_update_ppr(vcpu->arch.apic);
}
+EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
{
@@ -579,10 +626,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
{
- if (apic_x2apic_mode(apic))
- return mda == X2APIC_BROADCAST;
-
- return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
+ return mda == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
}
static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
@@ -591,9 +636,18 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
return true;
if (apic_x2apic_mode(apic))
- return mda == kvm_apic_id(apic);
+ return mda == kvm_x2apic_id(apic);
+
+ /*
+ * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
+ * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and
+ * this allows unique addressing of VCPUs with APIC ID over 0xff.
+ * The 0xff condition is needed because writeable xAPIC ID.
+ */
+ if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
+ return true;
- return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
+ return mda == kvm_xapic_id(apic);
}
static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -610,7 +664,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
&& (logical_id & mda & 0xffff) != 0;
logical_id = GET_APIC_LOGICAL_ID(logical_id);
- mda = GET_APIC_DEST_FIELD(mda);
switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
@@ -627,9 +680,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
/* The KVM local APIC implementation has two quirks:
*
- * - the xAPIC MDA stores the destination at bits 24-31, while this
- * is not true of struct kvm_lapic_irq's dest_id field. This is
- * just a quirk in the API and is not problematic.
+ * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ * KVM doesn't do that aliasing.
*
* - in-kernel IOAPIC messages have to be delivered directly to
* x2APIC, because the kernel does not support interrupt remapping.
@@ -645,13 +698,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
struct kvm_lapic *source, struct kvm_lapic *target)
{
bool ipi = source != NULL;
- bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
- !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+ !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
return X2APIC_BROADCAST;
- return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
+ return dest_id;
}
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
@@ -1907,9 +1959,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
- apic_debug("%s: vcpu=%p, id=%d, base_msr="
+ apic_debug("%s: vcpu=%p, id=0x%x, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
- vcpu, kvm_apic_id(apic),
+ vcpu, kvm_lapic_get_reg(apic, APIC_ID),
vcpu->arch.apic_base, apic->base_address);
}
@@ -2021,17 +2073,13 @@ nomem:
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- int highest_irr;
+ u32 ppr;
if (!apic_enabled(apic))
return -1;
- apic_update_ppr(apic);
- highest_irr = apic_find_highest_irr(apic);
- if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI)))
- return -1;
- return highest_irr;
+ __apic_update_ppr(apic, &ppr);
+ return apic_has_interrupt_for_ppr(apic, ppr);
}
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
@@ -2067,6 +2115,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
if (vector == -1)
return -1;
@@ -2078,13 +2127,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
* because the process would deliver it through the IDT.
*/
- apic_set_isr(vector, apic);
- apic_update_ppr(apic);
apic_clear_irr(vector, apic);
-
if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
- apic_clear_isr(vector, apic);
+ /*
+ * For auto-EOI interrupts, there might be another pending
+ * interrupt above PPR, so check whether to raise another
+ * KVM_REQ_EVENT.
+ */
apic_update_ppr(apic);
+ } else {
+ /*
+ * For normal interrupts, PPR has been raised and there cannot
+ * be a higher-priority pending interrupt---except if there was
+ * a concurrent interrupt injection, but that would have
+ * triggered KVM_REQ_EVENT already.
+ */
+ apic_set_isr(vector, apic);
+ __apic_update_ppr(apic, &ppr);
}
return vector;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff8039d61672..05abd837b78a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -73,6 +73,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -203,17 +204,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
-static inline u32 kvm_apic_id(struct kvm_lapic *apic)
-{
- /* To avoid a race between apic_base and following APIC_ID update when
- * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
- */
- if (apic_x2apic_mode(apic))
- return apic->vcpu->vcpu_id;
-
- return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
-}
-
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7012de4a1fed..64821ca3a7c3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,8 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -129,6 +131,10 @@ module_param(dbg, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+/* The mask for the R/X bits in EPT PTEs */
+#define PT64_EPT_READABLE_MASK 0x1ull
+#define PT64_EPT_EXECUTABLE_MASK 0x4ull
+
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -178,6 +184,25 @@ static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_present_mask;
+/*
+ * The mask/value to distinguish a PTE that has been marked not-present for
+ * access tracking purposes.
+ * The mask would be either 0 if access tracking is disabled, or
+ * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ */
+static u64 __read_mostly shadow_acc_track_mask;
+static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.
+ */
+static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
+ PT64_EPT_EXECUTABLE_MASK;
+static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -187,6 +212,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static inline bool is_access_track_spte(u64 spte)
+{
+ /* Always false if shadow_acc_track_mask is zero. */
+ return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+}
+
/*
* the low bit of the generation number is always presumed to be zero.
* This disables mmio caching during memslot updates. The concept is
@@ -284,7 +315,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
}
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+ u64 acc_track_mask)
{
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
@@ -292,9 +324,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
shadow_present_mask = p_mask;
+ shadow_acc_track_mask = acc_track_mask;
+ WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+void kvm_mmu_clear_all_pte_masks(void)
+{
+ shadow_user_mask = 0;
+ shadow_accessed_mask = 0;
+ shadow_dirty_mask = 0;
+ shadow_nx_mask = 0;
+ shadow_x_mask = 0;
+ shadow_mmio_mask = 0;
+ shadow_present_mask = 0;
+ shadow_acc_track_mask = 0;
+}
+
static int is_cpuid_PSE36(void)
{
return 1;
@@ -307,7 +353,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
static int is_shadow_present_pte(u64 pte)
{
- return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
+ return (pte != 0) && !is_mmio_spte(pte);
}
static int is_large_pte(u64 pte)
@@ -473,7 +519,7 @@ retry:
}
#endif
-static bool spte_is_locklessly_modifiable(u64 spte)
+static bool spte_can_locklessly_be_made_writable(u64 spte)
{
return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
@@ -481,36 +527,38 @@ static bool spte_is_locklessly_modifiable(u64 spte)
static bool spte_has_volatile_bits(u64 spte)
{
+ if (!is_shadow_present_pte(spte))
+ return false;
+
/*
* Always atomically update spte if it can be updated
* out of mmu-lock, it can ensure dirty bit is not lost,
* also, it can help us to get a stable is_writable_pte()
* to ensure tlb flush is not missed.
*/
- if (spte_is_locklessly_modifiable(spte))
+ if (spte_can_locklessly_be_made_writable(spte) ||
+ is_access_track_spte(spte))
return true;
- if (!shadow_accessed_mask)
- return false;
-
- if (!is_shadow_present_pte(spte))
- return false;
-
- if ((spte & shadow_accessed_mask) &&
- (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
- return false;
+ if (shadow_accessed_mask) {
+ if ((spte & shadow_accessed_mask) == 0 ||
+ (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
+ return true;
+ }
- return true;
+ return false;
}
-static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+static bool is_accessed_spte(u64 spte)
{
- return (old_spte & bit_mask) && !(new_spte & bit_mask);
+ return shadow_accessed_mask ? spte & shadow_accessed_mask
+ : !is_access_track_spte(spte);
}
-static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
+static bool is_dirty_spte(u64 spte)
{
- return (old_spte & bit_mask) != (new_spte & bit_mask);
+ return shadow_dirty_mask ? spte & shadow_dirty_mask
+ : spte & PT_WRITABLE_MASK;
}
/* Rules for using mmu_spte_set:
@@ -525,25 +573,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
__set_spte(sptep, new_spte);
}
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changed.
- *
- * Whenever we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB, the return value indicates this
- * case.
+/*
+ * Update the SPTE (excluding the PFN), but do not track changes in its
+ * accessed/dirty status.
*/
-static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
- bool ret = false;
WARN_ON(!is_shadow_present_pte(new_spte));
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
- return ret;
+ return old_spte;
}
if (!spte_has_volatile_bits(old_spte))
@@ -551,45 +593,62 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
+ WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
+ return old_spte;
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
+ *
+ * Returns true if the TLB needs to be flushed
+ */
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+{
+ bool flush = false;
+ u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
+
+ if (!is_shadow_present_pte(old_spte))
+ return false;
+
/*
* For the spte updated out of mmu-lock is safe, since
* we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
- if (spte_is_locklessly_modifiable(old_spte) &&
+ if (spte_can_locklessly_be_made_writable(old_spte) &&
!is_writable_pte(new_spte))
- ret = true;
-
- if (!shadow_accessed_mask) {
- /*
- * We don't set page dirty when dropping non-writable spte.
- * So do it now if the new spte is becoming non-writable.
- */
- if (ret)
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
- return ret;
- }
+ flush = true;
/*
- * Flush TLB when accessed/dirty bits are changed in the page tables,
+ * Flush TLB when accessed/dirty states are changed in the page tables,
* to guarantee consistency between TLB and page tables.
*/
- if (spte_is_bit_changed(old_spte, new_spte,
- shadow_accessed_mask | shadow_dirty_mask))
- ret = true;
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+ if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
+ flush = true;
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+ }
+
+ if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
+ flush = true;
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+ }
- return ret;
+ return flush;
}
/*
* Rules for using mmu_spte_clear_track_bits:
* It sets the sptep from present to nonpresent, and track the
* state bits, it is used to clear the last level sptep.
+ * Returns non-zero if the PTE was previously valid.
*/
static int mmu_spte_clear_track_bits(u64 *sptep)
{
@@ -613,11 +672,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
*/
WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
- if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ if (is_accessed_spte(old_spte))
kvm_set_pfn_accessed(pfn);
- if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
- PT_WRITABLE_MASK))
+
+ if (is_dirty_spte(old_spte))
kvm_set_pfn_dirty(pfn);
+
return 1;
}
@@ -636,6 +696,61 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
+static u64 mark_spte_for_access_track(u64 spte)
+{
+ if (shadow_accessed_mask != 0)
+ return spte & ~shadow_accessed_mask;
+
+ if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+ return spte;
+
+ /*
+ * Verify that the write-protection that we do below will be fixable
+ * via the fast page fault path. Currently, that is always the case, at
+ * least when using EPT (which is when access tracking would be used).
+ */
+ WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+ !spte_can_locklessly_be_made_writable(spte),
+ "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+
+ WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
+ shadow_acc_track_saved_bits_shift),
+ "kvm: Access Tracking saved bit locations are not zero\n");
+
+ spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+ shadow_acc_track_saved_bits_shift;
+ spte &= ~shadow_acc_track_mask;
+ spte |= shadow_acc_track_value;
+
+ return spte;
+}
+
+/* Returns the Accessed status of the PTE and resets it at the same time. */
+static bool mmu_spte_age(u64 *sptep)
+{
+ u64 spte = mmu_spte_get_lockless(sptep);
+
+ if (!is_accessed_spte(spte))
+ return false;
+
+ if (shadow_accessed_mask) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
+ */
+ if (is_writable_pte(spte))
+ kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+ spte = mark_spte_for_access_track(spte);
+ mmu_spte_update_no_track(sptep, spte);
+ }
+
+ return true;
+}
+
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
/*
@@ -1212,7 +1327,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
u64 spte = *sptep;
if (!is_writable_pte(spte) &&
- !(pt_protect && spte_is_locklessly_modifiable(spte)))
+ !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
return false;
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
@@ -1420,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
- sptep, *sptep, gfn, level);
+ sptep, *sptep, gfn, level);
need_flush = 1;
@@ -1433,7 +1548,8 @@ restart:
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- new_spte &= ~shadow_accessed_mask;
+
+ new_spte = mark_spte_for_access_track(new_spte);
mmu_spte_clear_track_bits(sptep);
mmu_spte_set(sptep, new_spte);
@@ -1595,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct rmap_iterator uninitialized_var(iter);
int young = 0;
- BUG_ON(!shadow_accessed_mask);
-
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- if (*sptep & shadow_accessed_mask) {
- young = 1;
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- }
- }
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ young |= mmu_spte_age(sptep);
trace_kvm_age_page(gfn, level, slot, young);
return young;
@@ -1615,24 +1724,20 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
{
u64 *sptep;
struct rmap_iterator iter;
- int young = 0;
/*
- * If there's no access bit in the secondary pte set by the
- * hardware it's up to gup-fast/gup to set the access bit in
- * the primary pte or in the page structure.
+ * If there's no access bit in the secondary pte set by the hardware and
+ * fast access tracking is also not enabled, it's up to gup-fast/gup to
+ * set the access bit in the primary pte or in the page structure.
*/
- if (!shadow_accessed_mask)
+ if (!shadow_accessed_mask && !shadow_acc_track_mask)
goto out;
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- if (*sptep & shadow_accessed_mask) {
- young = 1;
- break;
- }
- }
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (is_accessed_spte(*sptep))
+ return 1;
out:
- return young;
+ return 0;
}
#define RMAP_RECYCLE_THRESHOLD 1000
@@ -1660,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
* This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages.
*/
- if (!shadow_accessed_mask)
+ if (!shadow_accessed_mask && !shadow_acc_track_mask)
return kvm_handle_hva_range(kvm, start, end, 0,
kvm_unmap_rmapp);
@@ -1713,7 +1818,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
- return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+ return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
@@ -1904,17 +2009,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
* since it has been deleted from active_mmu_pages but still can be found
* at hast list.
*
- * for_each_gfn_valid_sp() has skipped that kind of pages.
+ * for_each_valid_sp() has skipped that kind of pages.
*/
-#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
+#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \
- || (_sp)->role.invalid) {} else
+ if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
+ } else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
- for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
- if ((_sp)->role.direct) {} else
+ for_each_valid_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2116,6 +2221,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp;
bool need_sync = false;
bool flush = false;
+ int collisions = 0;
LIST_HEAD(invalid_list);
role = vcpu->arch.mmu.base_role;
@@ -2130,7 +2236,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) {
+ for_each_valid_sp(vcpu->kvm, sp, gfn) {
+ if (sp->gfn != gfn) {
+ collisions++;
+ continue;
+ }
+
if (!need_sync && sp->unsync)
need_sync = true;
@@ -2153,7 +2264,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
__clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
- return sp;
+ goto out;
}
++vcpu->kvm->stat.mmu_cache_miss;
@@ -2183,6 +2294,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
trace_kvm_mmu_get_page(sp, true);
kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+out:
+ if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
+ vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
return sp;
}
@@ -2583,6 +2697,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= shadow_dirty_mask;
}
+ if (speculative)
+ spte = mark_spte_for_access_track(spte);
+
set_pte:
if (mmu_spte_update(sptep, spte))
kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2636,7 +2753,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+ *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
*sptep, sptep);
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
@@ -2869,31 +2986,56 @@ static bool page_fault_can_be_fast(u32 error_code)
if (unlikely(error_code & PFERR_RSVD_MASK))
return false;
+ /* See if the page fault is due to an NX violation */
+ if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+ == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+ return false;
+
/*
- * #PF can be fast only if the shadow page table is present and it
- * is caused by write-protect, that means we just need change the
- * W bit of the spte which can be done out of mmu-lock.
+ * #PF can be fast if:
+ * 1. The shadow page table entry is not present, which could mean that
+ * the fault is potentially caused by access tracking (if enabled).
+ * 2. The shadow page table entry is present and the fault
+ * is caused by write-protect, that means we just need change the W
+ * bit of the spte which can be done out of mmu-lock.
+ *
+ * However, if access tracking is disabled we know that a non-present
+ * page must be a genuine page fault where we have to create a new SPTE.
+ * So, if access tracking is disabled, we return true only for write
+ * accesses to a present page.
*/
- if (!(error_code & PFERR_PRESENT_MASK) ||
- !(error_code & PFERR_WRITE_MASK))
- return false;
- return true;
+ return shadow_acc_track_mask != 0 ||
+ ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+ == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
}
+/*
+ * Returns true if the SPTE was fixed successfully. Otherwise,
+ * someone else modified the SPTE from its original value.
+ */
static bool
fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *sptep, u64 spte)
+ u64 *sptep, u64 old_spte,
+ bool remove_write_prot, bool remove_acc_track)
{
gfn_t gfn;
+ u64 new_spte = old_spte;
WARN_ON(!sp->role.direct);
- /*
- * The gfn of direct spte is stable since it is calculated
- * by sp->gfn.
- */
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ if (remove_acc_track) {
+ u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift)
+ & shadow_acc_track_saved_bits_mask;
+
+ new_spte &= ~shadow_acc_track_mask;
+ new_spte &= ~(shadow_acc_track_saved_bits_mask <<
+ shadow_acc_track_saved_bits_shift);
+ new_spte |= saved_bits;
+ }
+
+ if (remove_write_prot)
+ new_spte |= PT_WRITABLE_MASK;
/*
* Theoretically we could also set dirty bit (and flush TLB) here in
@@ -2907,8 +3049,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
*
* Compare with set_spte where instead shadow_dirty_mask is set.
*/
- if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+ if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
+ return false;
+
+ if (remove_write_prot) {
+ /*
+ * The gfn of direct spte is stable since it is
+ * calculated by sp->gfn.
+ */
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
return true;
}
@@ -2923,8 +3074,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- bool ret = false;
+ bool fault_handled = false;
u64 spte = 0ull;
+ uint retry_count = 0;
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return false;
@@ -2937,62 +3089,99 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
if (!is_shadow_present_pte(spte) || iterator.level < level)
break;
- /*
- * If the mapping has been changed, let the vcpu fault on the
- * same address again.
- */
- if (!is_shadow_present_pte(spte)) {
- ret = true;
- goto exit;
- }
+ do {
+ bool remove_write_prot = false;
+ bool remove_acc_track;
- sp = page_header(__pa(iterator.sptep));
- if (!is_last_spte(spte, sp->role.level))
- goto exit;
+ sp = page_header(__pa(iterator.sptep));
+ if (!is_last_spte(spte, sp->role.level))
+ break;
- /*
- * Check if it is a spurious fault caused by TLB lazily flushed.
- *
- * Need not check the access of upper level table entries since
- * they are always ACC_ALL.
- */
- if (is_writable_pte(spte)) {
- ret = true;
- goto exit;
- }
+ /*
+ * Check whether the memory access that caused the fault would
+ * still cause it if it were to be performed right now. If not,
+ * then this is a spurious fault caused by TLB lazily flushed,
+ * or some other CPU has already fixed the PTE after the
+ * current CPU took the fault.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
- /*
- * Currently, to simplify the code, only the spte write-protected
- * by dirty-log can be fast fixed.
- */
- if (!spte_is_locklessly_modifiable(spte))
- goto exit;
+ if (error_code & PFERR_FETCH_MASK) {
+ if ((spte & (shadow_x_mask | shadow_nx_mask))
+ == shadow_x_mask) {
+ fault_handled = true;
+ break;
+ }
+ } else if (error_code & PFERR_WRITE_MASK) {
+ if (is_writable_pte(spte)) {
+ fault_handled = true;
+ break;
+ }
- /*
- * Do not fix write-permission on the large spte since we only dirty
- * the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
- * that means other pages are missed if its slot is dirty-logged.
- *
- * Instead, we let the slow page fault path create a normal spte to
- * fix the access.
- *
- * See the comments in kvm_arch_commit_memory_region().
- */
- if (sp->role.level > PT_PAGE_TABLE_LEVEL)
- goto exit;
+ /*
+ * Currently, to simplify the code, write-protection can
+ * be removed in the fast path only if the SPTE was
+ * write-protected for dirty-logging.
+ */
+ remove_write_prot =
+ spte_can_locklessly_be_made_writable(spte);
+ } else {
+ /* Fault was on Read access */
+ if (spte & PT_PRESENT_MASK) {
+ fault_handled = true;
+ break;
+ }
+ }
+
+ remove_acc_track = is_access_track_spte(spte);
+
+ /* Verify that the fault can be handled in the fast path */
+ if (!remove_acc_track && !remove_write_prot)
+ break;
+
+ /*
+ * Do not fix write-permission on the large spte since we only
+ * dirty the first page into the dirty-bitmap in
+ * fast_pf_fix_direct_spte() that means other pages are missed
+ * if its slot is dirty-logged.
+ *
+ * Instead, we let the slow page fault path create a normal spte
+ * to fix the access.
+ *
+ * See the comments in kvm_arch_commit_memory_region().
+ */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot)
+ break;
+
+ /*
+ * Currently, fast page fault only works for direct mapping
+ * since the gfn is not stable for indirect shadow page. See
+ * Documentation/virtual/kvm/locking.txt to get more detail.
+ */
+ fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
+ iterator.sptep, spte,
+ remove_write_prot,
+ remove_acc_track);
+ if (fault_handled)
+ break;
+
+ if (++retry_count > 4) {
+ printk_once(KERN_WARNING
+ "kvm: Fast #PF retrying more than 4 times.\n");
+ break;
+ }
+
+ spte = mmu_spte_get_lockless(iterator.sptep);
+
+ } while (true);
- /*
- * Currently, fast page fault only works for direct mapping since
- * the gfn is not stable for indirect shadow page.
- * See Documentation/virtual/kvm/locking.txt to get more detail.
- */
- ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
-exit:
trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
- spte, ret);
+ spte, fault_handled);
walk_shadow_page_lockless_end(vcpu);
- return ret;
+ return fault_handled;
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
@@ -5063,6 +5252,8 @@ static void mmu_destroy_caches(void)
int kvm_mmu_module_init(void)
{
+ kvm_mmu_clear_all_pte_masks();
+
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, 0, NULL);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 08a4d3ab3455..d0414f054bdf 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4182,6 +4182,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+ vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
+
if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a236decb81e4..4e691035a32d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5236,10 +5236,10 @@ static void ept_set_mmio_spte_mask(void)
/*
* EPT Misconfigurations can be generated if the value of bits 2:0
* of an EPT paging-structure entry is 110b (write/execute).
- * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
- * spte.
+ * Also, special bit (62) is set to quickly identify mmio spte.
*/
- kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
+ kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK |
+ VMX_EPT_MISCONFIG_WX_VALUE);
}
#define VMX_XSS_EXIT_BITMAP 0
@@ -6152,7 +6152,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_apic_update_ppr(vcpu);
return 1;
}
@@ -6374,14 +6374,20 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- /* it is a read fault? */
- error_code = (exit_qualification << 2) & PFERR_USER_MASK;
- /* it is a write fault? */
- error_code |= exit_qualification & PFERR_WRITE_MASK;
- /* It is a fetch fault? */
- error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
- /* ept page table is present? */
- error_code |= (exit_qualification & 0x38) != 0;
+ /* Is it a read fault? */
+ error_code = (exit_qualification & EPT_VIOLATION_READ)
+ ? PFERR_USER_MASK : 0;
+ /* Is it a write fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+ /* Is it a fetch fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification &
+ (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
+ EPT_VIOLATION_EXECUTABLE))
+ ? PFERR_PRESENT_MASK : 0;
vcpu->arch.exit_qualification = exit_qualification;
@@ -6572,6 +6578,19 @@ static void wakeup_handler(void)
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}
+void vmx_enable_tdp(void)
+{
+ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+ enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
+ enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
+ 0ull, VMX_EPT_EXECUTABLE_MASK,
+ cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
+ enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK);
+
+ ept_set_mmio_spte_mask();
+ kvm_enable_tdp();
+}
+
static __init int hardware_setup(void)
{
int r = -ENOMEM, i, msr;
@@ -6697,16 +6716,9 @@ static __init int hardware_setup(void)
/* SELF-IPI */
vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
- if (enable_ept) {
- kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
- (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
- (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK,
- cpu_has_vmx_ept_execute_only() ?
- 0ull : VMX_EPT_READABLE_MASK);
- ept_set_mmio_spte_mask();
- kvm_enable_tdp();
- } else
+ if (enable_ept)
+ vmx_enable_tdp();
+ else
kvm_disable_tdp();
update_ple_window_actual_max();
@@ -7168,9 +7180,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
- return 1;
-
if (vmx->nested.vmxon) {
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
return kvm_skip_emulated_instruction(vcpu);
@@ -7182,6 +7191,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
+ if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+ return 1;
+
if (cpu_has_vmx_msr_bitmap()) {
vmx->nested.msr_bitmap =
(unsigned long *)__get_free_page(GFP_KERNEL);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 57d8a856cdc5..6e2c71ea0627 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "nmi_injections", VCPU_STAT(nmi_injections) },
+ { "req_event", VCPU_STAT(req_event) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -190,6 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
+ { "max_mmu_page_hash_collisions",
+ VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
};
@@ -3896,7 +3899,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
- kvm->arch.irqchip_split = true;
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
r = 0;
split_irqchip_unlock:
@@ -3959,40 +3962,41 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
case KVM_CREATE_IRQCHIP: {
- struct kvm_pic *vpic;
-
mutex_lock(&kvm->lock);
+
r = -EEXIST;
- if (kvm->arch.vpic)
+ if (irqchip_in_kernel(kvm))
goto create_irqchip_unlock;
+
r = -EINVAL;
if (kvm->created_vcpus)
goto create_irqchip_unlock;
- r = -ENOMEM;
- vpic = kvm_create_pic(kvm);
- if (vpic) {
- r = kvm_ioapic_init(kvm);
- if (r) {
- mutex_lock(&kvm->slots_lock);
- kvm_destroy_pic(vpic);
- mutex_unlock(&kvm->slots_lock);
- goto create_irqchip_unlock;
- }
- } else
+
+ r = kvm_pic_init(kvm);
+ if (r)
+ goto create_irqchip_unlock;
+
+ r = kvm_ioapic_init(kvm);
+ if (r) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_pic_destroy(kvm);
+ mutex_unlock(&kvm->slots_lock);
goto create_irqchip_unlock;
+ }
+
r = kvm_setup_default_irq_routing(kvm);
if (r) {
mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
- kvm_destroy_pic(vpic);
+ kvm_pic_destroy(kvm);
mutex_unlock(&kvm->irq_lock);
mutex_unlock(&kvm->slots_lock);
goto create_irqchip_unlock;
}
- /* Write kvm->irq_routing before kvm->arch.vpic. */
+ /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
- kvm->arch.vpic = vpic;
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -4028,7 +4032,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
+ if (!irqchip_kernel(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -4052,7 +4056,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
+ if (!irqchip_kernel(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
@@ -4461,6 +4465,21 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t gpa, bool write)
+{
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ return 1;
+
+ if (vcpu_match_mmio_gpa(vcpu, gpa)) {
+ trace_vcpu_match_mmio(gva, gpa, write, true);
+ return 1;
+ }
+
+ return 0;
+}
+
static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
@@ -4487,16 +4506,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
if (*gpa == UNMAPPED_GVA)
return -1;
- /* For APIC access vmexit */
- if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- return 1;
-
- if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
- trace_vcpu_match_mmio(gva, *gpa, write, true);
- return 1;
- }
-
- return 0;
+ return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -4593,6 +4603,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
int handled, ret;
bool write = ops->write;
struct kvm_mmio_fragment *frag;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ /*
+ * If the exit was due to a NPF we may already have a GPA.
+ * If the GPA is present, use it to avoid the GVA to GPA table walk.
+ * Note, this cannot be used on string operations since string
+ * operation using rep will only have the initial GPA from the NPF
+ * occurred.
+ */
+ if (vcpu->arch.gpa_available &&
+ emulator_can_use_gpa(ctxt) &&
+ vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) &&
+ (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) {
+ gpa = exception->address;
+ goto mmio;
+ }
ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -5609,6 +5635,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
}
restart:
+ /* Save the faulting GPA (cr2) in the address field */
+ ctxt->exception.address = cr2;
+
r = x86_emulate_insn(ctxt);
if (r == EMULATION_INTERCEPTED)
@@ -6024,7 +6053,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0,
- PT_PRESENT_MASK);
+ PT_PRESENT_MASK, 0);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6731,6 +6760,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ ++vcpu->stat.req_event;
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
r = 1;
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index cddd5d06e1cb..3603556fa0d9 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -280,6 +280,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */