diff options
-rw-r--r-- | Documentation/virtual/kvm/locking.txt | 31 | ||||
-rw-r--r-- | arch/powerpc/include/uapi/asm/kvm.h | 2 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv.c | 16 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rm_xics.c | 138 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_pr.c | 130 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_xics.c | 192 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_xics.h | 7 | ||||
-rw-r--r-- | arch/x86/include/asm/cpufeatures.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_emulate.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 25 | ||||
-rw-r--r-- | arch/x86/include/asm/vmx.h | 28 | ||||
-rw-r--r-- | arch/x86/kernel/fpu/xstate.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 16 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 19 | ||||
-rw-r--r-- | arch/x86/kvm/irq_comm.c | 29 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 135 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 489 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 62 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 92 | ||||
-rw-r--r-- | tools/arch/x86/include/asm/cpufeatures.h | 1 |
25 files changed, 951 insertions, 505 deletions
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index fd013bf4115b..1bb8bcaf8497 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -26,9 +26,16 @@ sections. Fast page fault: Fast page fault is the fast path which fixes the guest page fault out of -the mmu-lock on x86. Currently, the page fault can be fast only if the -shadow page table is present and it is caused by write-protect, that means -we just need change the W bit of the spte. +the mmu-lock on x86. Currently, the page fault can be fast in one of the +following two cases: + +1. Access Tracking: The SPTE is not present, but it is marked for access +tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to +restore the saved R/X bits. This is described in more detail later below. + +2. Write-Protection: The SPTE is present and the fault is +caused by write-protect. That means we just need to change the W bit of the +spte. What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and SPTE_MMU_WRITEABLE bit on the spte: @@ -38,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte: page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or +restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This is safe because whenever changing these bits can be detected by cmpxchg. But we need carefully check these cases: @@ -142,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always atomically update the spte, the race caused by fast page fault can be avoided, See the comments in spte_has_volatile_bits() and mmu_spte_update(). +Lockless Access Tracking: + +This is used for Intel CPUs that are using EPT but do not support the EPT A/D +bits. In this case, when the KVM MMU notifier is called to track accesses to a +page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present +by clearing the RWX bits in the PTE and storing the original R & X bits in +some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the +PTE (using the ignored bit 62). When the VM tries to access the page later on, +a fault is generated and the fast page fault mechanism described above is used +to atomically restore the PTE to a Present state. The W bit is not saved when +the PTE is marked for access tracking and during restoration to the Present +state, the W bit is set depending on whether or not it was a write access. If +it wasn't, then the W bit will remain clear until a write access happens, at +which time it will be set using the Dirty tracking mechanism described above. + 3. Reference ------------ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index cc0908b6c2a0..4edbe4bb0e8b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -633,5 +633,7 @@ struct kvm_ppc_rmmu_info { #define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40) #define KVM_XICS_MASKED (1ULL << 41) #define KVM_XICS_PENDING (1ULL << 42) +#define KVM_XICS_PRESENTED (1ULL << 43) +#define KVM_XICS_QUEUED (1ULL << 44) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e4a79679342e..bdf281cc88c0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_wakeup; } - if (kvmppc_ipi_thread(vcpu->arch.thread_cpu)) + cpu = READ_ONCE(vcpu->arch.thread_cpu); + if (cpu >= 0 && kvmppc_ipi_thread(cpu)) return; /* CPU points to the first thread of the core */ @@ -773,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } tvcpu->arch.prodded = 1; smp_mb(); - if (vcpu->arch.ceded) { - if (swait_active(&vcpu->wq)) { - swake_up(&vcpu->wq); - vcpu->stat.halt_wakeup++; - } - } + if (tvcpu->arch.ceded) + kvmppc_fast_vcpu_kick_hv(tvcpu); break; case H_CONFER: target = kvmppc_get_gpr(vcpu, 4); @@ -2665,7 +2662,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) int i; for_each_runnable_thread(i, vcpu, vc) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || + vcpu->arch.prodded) return 1; } @@ -2851,7 +2849,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; n_ceded = 0; for_each_runnable_thread(i, v, vc) { - if (!v->arch.pending_exceptions) + if (!v->arch.pending_exceptions && !v->arch.prodded) n_ceded += v->arch.ceded; else v->arch.ceded = 0; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 7e2eb3e865b3..0b2e388f4cdf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -35,7 +35,7 @@ int kvm_irq_bypass = 1; EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu); /* -- ICS routines -- */ @@ -44,20 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, { int i; - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - arch_spin_unlock(&ics->lock); - icp_rm_deliver_irq(xics, icp, state->number); - arch_spin_lock(&ics->lock); + if (state->resend) + icp_rm_deliver_irq(xics, icp, state->number, true); } - arch_spin_unlock(&ics->lock); } /* -- ICP routines -- */ @@ -288,7 +280,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -333,6 +325,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -378,7 +374,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ if (reject && reject != XICS_IPI) { arch_spin_unlock(&ics->lock); + icp->n_reject++; new_irq = reject; + check_resend = 0; goto again; } } else { @@ -386,10 +384,16 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. + */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + + /* * If the need_resend flag got cleared in the ICP some time * between icp_rm_try_to_deliver() atomic update and now, then * we know it might have missed the resend_map bit. So we @@ -397,7 +401,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); + check_resend = 0; goto again; } } @@ -592,7 +598,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject in real mode */ if (reject && reject != XICS_IPI) { this_icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } /* Handle resends in real mode */ @@ -660,59 +666,45 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) */ if (reject && reject != XICS_IPI) { icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } bail: return check_too_hard(xics, icp); } -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; - - if (!xics || !xics->real_mode) - return H_TOO_HARD; + u32 pq_old, pq_new; /* - * ICP State: EOI + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. * - * Note: If EOI is incorrectly used by SW to lower the CPPR - * value (ie more favored), we do not check for rejection of - * a pending interrupt, this is a SW error and PAPR sepcifies - * that we don't have to deal with it. - * - * The sending of an EOI to the ICS is handled after the - * CPPR update - * - * ICP State: Down_CPPR which we handle - * in a separate function as it's shared with H_CPPR. + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. */ - icp_rm_down_cppr(xics, icp, xirr >> 24); - /* IPIs have no EOI */ - if (irq == XICS_IPI) - goto bail; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. - * - * "Message" interrupts will never have "asserted" set - */ ics = kvmppc_xics_find_ics(xics, irq, &src); if (!ics) goto bail; + state = &ics->irq_state[src]; - /* Still asserted, resend it */ - if (state->asserted) { - icp->n_reject++; - icp_rm_deliver_irq(xics, icp, irq); - } + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_rm_deliver_irq(xics, NULL, irq, false); if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; @@ -733,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) state->intr_cpu = -1; } } + bail: return check_too_hard(xics, icp); } +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + return check_too_hard(xics, icp); + + return ics_rm_eoi(vcpu, irq); +} + unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) @@ -823,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, { struct kvmppc_xics *xics; struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; u32 irq; + u16 src; + u32 pq_old, pq_new; irq = irq_map->v_hwirq; xics = vcpu->kvm->arch.xics; icp = vcpu->arch.icp; kvmppc_rm_handle_irq_desc(irq_map->desc); - icp_rm_deliver_irq(xics, icp, irq); + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return 2; + + state = &ics->irq_state[src]; + + /* only MSIs register bypass producers, so it must be MSI here */ + do { + pq_old = state->pq_state; + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_rm_deliver_irq(xics, icp, irq, false); /* EOI the interrupt */ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 1482961ceb4d..d4dfc0ca2a44 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu) } } +static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + enum emulation_result er; + ulong flags; + u32 last_inst; + int emul, r; + + /* + * shadow_srr1 only contains valid flags if we came here via a program + * exception. The other exceptions (emulation assist, FP unavailable, + * etc.) do not provide flags in SRR1, so use an illegal-instruction + * exception when injecting a program interrupt into the guest. + */ + if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + else + flags = SRR1_PROGILL; + + emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + if (emul != EMULATE_DONE) + return RESUME_GUEST; + + if (kvmppc_get_msr(vcpu) & MSR_PR) { +#ifdef EXIT_DEBUG + pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", + kvmppc_get_pc(vcpu), last_inst); +#endif + if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) { + kvmppc_core_queue_program(vcpu, flags); + return RESUME_GUEST; + } + } + + vcpu->stat.emulated_inst_exits++; + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + r = RESUME_GUEST_NV; + break; + case EMULATE_AGAIN: + r = RESUME_GUEST; + break; + case EMULATE_FAIL: + pr_crit("%s: emulation at %lx failed (%08x)\n", + __func__, kvmppc_get_pc(vcpu), last_inst); + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST_NV; + break; + case EMULATE_EXIT_USER: + r = RESUME_HOST_NV; + break; + default: + BUG(); + } + + return r; +} + int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_PROGRAM: case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - { - enum emulation_result er; - ulong flags; - u32 last_inst; - int emul; - -program_interrupt: - /* - * shadow_srr1 only contains valid flags if we came here via - * a program exception. The other exceptions (emulation assist, - * FP unavailable, etc.) do not provide flags in SRR1, so use - * an illegal-instruction exception when injecting a program - * interrupt into the guest. - */ - if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) - flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; - else - flags = SRR1_PROGILL; - - emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); - if (emul != EMULATE_DONE) { - r = RESUME_GUEST; - break; - } - - if (kvmppc_get_msr(vcpu) & MSR_PR) { -#ifdef EXIT_DEBUG - pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", - kvmppc_get_pc(vcpu), last_inst); -#endif - if ((last_inst & 0xff0007ff) != - (INS_DCBZ & 0xfffffff7)) { - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - } - } - - vcpu->stat.emulated_inst_exits++; - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - r = RESUME_GUEST_NV; - break; - case EMULATE_AGAIN: - r = RESUME_GUEST; - break; - case EMULATE_FAIL: - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, kvmppc_get_pc(vcpu), last_inst); - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - case EMULATE_DO_MMIO: - run->exit_reason = KVM_EXIT_MMIO; - r = RESUME_HOST_NV; - break; - case EMULATE_EXIT_USER: - r = RESUME_HOST_NV; - break; - default: - BUG(); - } + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); break; - } case BOOK3S_INTERRUPT_SYSCALL: { u32 last_sc; @@ -1185,7 +1185,7 @@ program_interrupt: emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); if (emul == EMULATE_DONE) - goto program_interrupt; + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); else r = RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 20dff102a06f..e48803e2918d 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -63,7 +63,7 @@ /* -- ICS routines -- */ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); /* * Return value ideally indicates how the interrupt was handled, but no @@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) struct ics_irq_state *state; struct kvmppc_ics *ics; u16 src; + u32 pq_old, pq_new; XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); @@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) if (!state->exists) return -EINVAL; + if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET) + level = 1; + else if (level == KVM_INTERRUPT_UNSET) + level = 0; /* - * We set state->asserted locklessly. This should be fine as - * we are the only setter, thus concurrent access is undefined - * to begin with. + * Take other values the same as 1, consistent with original code. + * maybe WARN here? */ - if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) - state->asserted = 1; - else if (level == 0 || level == KVM_INTERRUPT_UNSET) { - state->asserted = 0; + + if (!state->lsi && level == 0) /* noop for MSI */ return 0; - } + + do { + pq_old = state->pq_state; + if (state->lsi) { + if (level) { + if (pq_old & PQ_PRESENTED) + /* Setting already set LSI ... */ + return 0; + + pq_new = PQ_PRESENTED; + } else + pq_new = 0; + } else + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_deliver_irq(xics, NULL, irq, false); /* Record which CPU this arrived on for passed-through interrupts */ if (state->host_irq) state->intr_cpu = raw_smp_processor_id(); - /* Attempt delivery */ - icp_deliver_irq(xics, NULL, irq); - return 0; } @@ -114,29 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, { int i; - unsigned long flags; - - local_irq_save(flags); - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - XICS_DBG("resend %#x prio %#x\n", state->number, - state->priority); - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); - icp_deliver_irq(xics, icp, state->number); - local_irq_save(flags); - arch_spin_lock(&ics->lock); + if (state->resend) { + XICS_DBG("resend %#x prio %#x\n", state->number, + state->priority); + icp_deliver_irq(xics, icp, state->number, true); + } } - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); } static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, @@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, deliver = false; if ((state->masked_pending || state->resend) && priority != MASKED) { state->masked_pending = 0; + state->resend = 0; deliver = true; } @@ -189,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) state->masked_pending, state->resend); if (write_xive(xics, ics, state, server, priority, priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -242,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) if (write_xive(xics, ics, state, state->server, state->saved_priority, state->saved_priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -376,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -422,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -470,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); local_irq_restore(flags); new_irq = reject; + check_resend = 0; goto again; } } else { @@ -477,10 +485,16 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. + */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + + /* * If the need_resend flag got cleared in the ICP some time * between icp_try_to_deliver() atomic update and now, then * we know it might have missed the resend_map bit. So we @@ -488,8 +502,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); + check_resend = 0; goto again; } } @@ -681,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); /* Handle resend */ if (resend) @@ -761,17 +777,54 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * attempt (see comments in icp_deliver_irq). */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); } -static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; + u32 pq_old, pq_new; + + /* + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. + * + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. + */ + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_deliver_irq(xics, icp, irq, false); + + kvm_notify_acked_irq(vcpu->kvm, 0, irq); + + return H_SUCCESS; +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); @@ -794,26 +847,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) /* IPIs have no EOI */ if (irq == XICS_IPI) return H_SUCCESS; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. - * - * "Message" interrupts will never have "asserted" set - */ - ics = kvmppc_xics_find_ics(xics, irq, &src); - if (!ics) { - XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); - return H_PARAMETER; - } - state = &ics->irq_state[src]; - /* Still asserted, resend it */ - if (state->asserted) - icp_deliver_irq(xics, icp, irq); - - kvm_notify_acked_irq(vcpu->kvm, 0, irq); - - return H_SUCCESS; + return ics_eoi(vcpu, irq); } int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) @@ -832,10 +867,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) icp->n_rm_check_resend++; icp_check_resend(xics, icp->rm_resend_icp); } - if (icp->rm_action & XICS_RM_REJECT) { - icp->n_rm_reject++; - icp_deliver_irq(xics, icp, icp->rm_reject); - } if (icp->rm_action & XICS_RM_NOTIFY_EOI) { icp->n_rm_notify_eoi++; kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); @@ -920,7 +951,7 @@ static int xics_debug_show(struct seq_file *m, void *private) int icsid, i; unsigned long flags; unsigned long t_rm_kick_vcpu, t_rm_check_resend; - unsigned long t_rm_reject, t_rm_notify_eoi; + unsigned long t_rm_notify_eoi; unsigned long t_reject, t_check_resend; if (!kvm) @@ -929,7 +960,6 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu = 0; t_rm_notify_eoi = 0; t_rm_check_resend = 0; - t_rm_reject = 0; t_check_resend = 0; t_reject = 0; @@ -952,14 +982,13 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu += icp->n_rm_kick_vcpu; t_rm_notify_eoi += icp->n_rm_notify_eoi; t_rm_check_resend += icp->n_rm_check_resend; - t_rm_reject += icp->n_rm_reject; t_check_resend += icp->n_check_resend; t_reject += icp->n_reject; } - seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n", t_rm_kick_vcpu, t_rm_check_resend, - t_rm_reject, t_rm_notify_eoi); + t_rm_notify_eoi); seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", t_check_resend, t_reject); for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { @@ -977,9 +1006,9 @@ static int xics_debug_show(struct seq_file *m, void *private) for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *irq = &ics->irq_state[i]; - seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n", irq->number, irq->server, irq->priority, - irq->saved_priority, irq->asserted, + irq->saved_priority, irq->pq_state, irq->resend, irq->masked_pending); } @@ -1198,10 +1227,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) val |= prio << KVM_XICS_PRIORITY_SHIFT; if (irqp->lsi) { val |= KVM_XICS_LEVEL_SENSITIVE; - if (irqp->asserted) + if (irqp->pq_state & PQ_PRESENTED) val |= KVM_XICS_PENDING; } else if (irqp->masked_pending || irqp->resend) val |= KVM_XICS_PENDING; + + if (irqp->pq_state & PQ_PRESENTED) + val |= KVM_XICS_PRESENTED; + + if (irqp->pq_state & PQ_QUEUED) + val |= KVM_XICS_QUEUED; + ret = 0; } arch_spin_unlock(&ics->lock); @@ -1253,18 +1289,20 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) irqp->resend = 0; irqp->masked_pending = 0; irqp->lsi = 0; - irqp->asserted = 0; - if (val & KVM_XICS_LEVEL_SENSITIVE) { + irqp->pq_state = 0; + if (val & KVM_XICS_LEVEL_SENSITIVE) irqp->lsi = 1; - if (val & KVM_XICS_PENDING) - irqp->asserted = 1; - } + /* If PENDING, set P in case P is not saved because of old code */ + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + irqp->pq_state |= PQ_PRESENTED; + if (val & KVM_XICS_QUEUED) + irqp->pq_state |= PQ_QUEUED; irqp->exists = 1; arch_spin_unlock(&ics->lock); local_irq_restore(flags); if (val & KVM_XICS_PENDING) - icp_deliver_irq(xics, NULL, irqp->number); + icp_deliver_irq(xics, NULL, irqp->number, false); return 0; } diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 2a50320b55ca..ec5474cf70c6 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -31,16 +31,19 @@ /* Priority value to use for disabling an interrupt */ #define MASKED 0xff +#define PQ_PRESENTED 1 +#define PQ_QUEUED 2 + /* State for one irq source */ struct ics_irq_state { u32 number; u32 server; + u32 pq_state; u8 priority; u8 saved_priority; u8 resend; u8 masked_pending; u8 lsi; /* level-sensitive interrupt */ - u8 asserted; /* Only for LSI */ u8 exists; int intr_cpu; u32 host_irq; @@ -73,7 +76,6 @@ struct kvmppc_icp { */ #define XICS_RM_KICK_VCPU 0x1 #define XICS_RM_CHECK_RESEND 0x2 -#define XICS_RM_REJECT 0x4 #define XICS_RM_NOTIFY_EOI 0x8 u32 rm_action; struct kvm_vcpu *rm_kick_target; @@ -84,7 +86,6 @@ struct kvmppc_icp { /* Counters for each reason we exited real mode */ unsigned long n_rm_kick_vcpu; unsigned long n_rm_check_resend; - unsigned long n_rm_reject; unsigned long n_rm_notify_eoi; /* Counters for handling ICP processing in real mode */ unsigned long n_check_resend; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index eafee3161d1c..d9d7136edf05 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -288,6 +288,7 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ @@ -320,5 +321,4 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ - #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7befcb76..3e8c287090e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a7066dc1a7e9..417502cf42b6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -115,7 +115,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 +#define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 @@ -208,6 +208,13 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting + * with the SVE bit in EPT PTEs. + */ +#define SPTE_SPECIAL_MASK (1ULL << 62) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -668,6 +675,9 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + + /* GPA available (AMD only) */ + bool gpa_available; }; struct kvm_lpage_info { @@ -716,6 +726,12 @@ struct kvm_hv { HV_REFERENCE_TSC_PAGE tsc_ref; }; +enum kvm_irqchip_mode { + KVM_IRQCHIP_NONE, + KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ + KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -788,7 +804,7 @@ struct kvm_arch { u64 disabled_quirks; - bool irqchip_split; + enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; @@ -815,6 +831,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { @@ -844,6 +861,7 @@ struct kvm_vcpu_stat { u64 hypercalls; u64 irq_injections; u64 nmi_injections; + u64 req_event; }; struct x86_instruction_info; @@ -1050,7 +1068,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b5b2d4b924e..a22a4790f1ac 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -467,8 +467,16 @@ enum vmcs_field { #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) -#define VMX_EPT_ACCESS_BIT (1ull << 8) -#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) +#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) + +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul @@ -500,6 +508,22 @@ struct vmx_msr_entry { #define ENTRY_FAIL_VMCS_LINK_PTR 4 /* + * Exit Qualifications for EPT Violations + */ +#define EPT_VIOLATION_READ_BIT 0 +#define EPT_VIOLATION_WRITE_BIT 1 +#define EPT_VIOLATION_INSTR_BIT 2 +#define EPT_VIOLATION_READABLE_BIT 3 +#define EPT_VIOLATION_WRITABLE_BIT 4 +#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT) +#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT) +#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT) +#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) +#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) +#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1d7770447b3e..35f7024aace5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -78,6 +78,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); + setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e85f6bd7b9d5..09c2ac741567 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -383,7 +383,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cedbba0f3402..45c7306c8780 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -173,6 +173,7 @@ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ +#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4298,7 +4299,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -4336,7 +4337,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), - I(SrcMem | Stack, em_push), D(Undefined), + I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -4556,8 +4557,8 @@ static const struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), + I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov), + F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), @@ -5671,3 +5672,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) { writeback_registers(ctxt); } + +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->rep_prefix && (ctxt->d & String)) + return false; + + if (ctxt->d & TwoMemOp) + return false; + + return true; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..08b27e0c7b71 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return -ENOENT; memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360f1848..73ea24d4f119 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = { .write = picdev_eclr_write, }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm) +int kvm_pic_init(struct kvm *kvm) { struct kvm_pic *s; int ret; s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) - return NULL; + return -ENOMEM; spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; @@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); - return s; + kvm->arch.vpic = s; + + return 0; fail_unreg_1: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); @@ -648,13 +650,17 @@ fail_unlock: kfree(s); - return NULL; + return ret; } -void kvm_destroy_pic(struct kvm_pic *vpic) +void kvm_pic_destroy(struct kvm *kvm) { + struct kvm_pic *vpic = kvm->arch.vpic; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + + kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731eb3897..40d5b2cf6061 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -73,8 +73,8 @@ struct kvm_pic { unsigned long irq_states[PIC_NUM_PINS]; }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm_pic *vpic); +int kvm_pic_init(struct kvm *kvm); +void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); @@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm) static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_split; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; } -static inline int irqchip_in_kernel(struct kvm *kvm) +static inline int irqchip_kernel(struct kvm *kvm) { - struct kvm_pic *vpic = pic_irqchip(kvm); - bool ret; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; +} - ret = (vpic != NULL); - ret |= irqchip_split(kvm); +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; - /* Read vpic before kvm->irq_routing. */ + /* Matches with wmb after initializing kvm->irq_routing. */ smp_rmb(); return ret; } diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c0191615f23..b96d3893f121 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_pic *pic = pic_irqchip(kvm); - - /* - * XXX: rejecting pic routes when pic isn't in use would be better, - * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. - */ - if (!pic) - return -1; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) - return -1; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, line_status); } @@ -297,16 +284,20 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: - e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; - break; case KVM_IRQCHIP_PIC_SLAVE: + delta = 8; + /* fall through */ + case KVM_IRQCHIP_PIC_MASTER: + if (!pic_in_kernel(kvm)) + goto out; + e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; - delta = 8; break; case KVM_IRQCHIP_IOAPIC: + if (!ioapic_in_kernel(kvm)) + goto out; + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; @@ -409,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) void kvm_arch_post_irq_routing_update(struct kvm *kvm) { - if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + if (!irqchip_split(kvm)) return; kvm_make_scan_ioapic_request(kvm); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2f6ef5121a4c..33b799fd3a6e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + +static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) +{ + return apic->vcpu->vcpu_id; +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - u32 max_id = 255; + u32 max_id = 255; /* enough space for any xAPIC ID */ mutex_lock(&kvm->arch.apic_map_lock); kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) - max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); @@ -179,16 +189,28 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; u16 mask; - u32 ldr, aid; + u32 ldr; + u8 xapic_id; + u32 x2apic_id; if (!kvm_apic_present(vcpu)) continue; - aid = kvm_apic_id(apic); - ldr = kvm_lapic_get_reg(apic, APIC_LDR); + xapic_id = kvm_xapic_id(apic); + x2apic_id = kvm_x2apic_id(apic); - if (aid <= new->max_apic_id) - new->phys_map[aid] = apic; + /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + /* + * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, + * prevent them from masking VCPUs with APIC ID <= 0xff. + */ + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; + + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) { new->mode |= KVM_APIC_MODE_X2APIC; @@ -250,6 +272,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + WARN_ON_ONCE(id != apic->vcpu->vcpu_id); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); @@ -546,7 +570,15 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void apic_update_ppr(struct kvm_lapic *apic) +static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) +{ + int highest_irr = apic_find_highest_irr(apic); + if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) + return -1; + return highest_irr; +} + +static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; @@ -564,12 +596,27 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - if (old_ppr != ppr) { + *new_ppr = ppr; + if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } + + return ppr < old_ppr; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 ppr; + + if (__apic_update_ppr(apic, &ppr) && + apic_has_interrupt_for_ppr(apic, ppr) != -1) + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} + +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) +{ + apic_update_ppr(vcpu->arch.apic); } +EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { @@ -579,10 +626,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { - if (apic_x2apic_mode(apic)) - return mda == X2APIC_BROADCAST; - - return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; + return mda == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) @@ -591,9 +636,18 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) return true; if (apic_x2apic_mode(apic)) - return mda == kvm_apic_id(apic); + return mda == kvm_x2apic_id(apic); + + /* + * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if + * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and + * this allows unique addressing of VCPUs with APIC ID over 0xff. + * The 0xff condition is needed because writeable xAPIC ID. + */ + if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) + return true; - return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); + return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -610,7 +664,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); - mda = GET_APIC_DEST_FIELD(mda); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: @@ -627,9 +680,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) /* The KVM local APIC implementation has two quirks: * - * - the xAPIC MDA stores the destination at bits 24-31, while this - * is not true of struct kvm_lapic_irq's dest_id field. This is - * just a quirk in the API and is not problematic. + * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs + * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. + * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. @@ -645,13 +698,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; - bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && - !ipi && dest_id == APIC_BROADCAST && x2apic_mda) + !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; - return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); + return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, @@ -1907,9 +1959,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug("%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_apic_id(apic), + vcpu, kvm_lapic_get_reg(apic, APIC_ID), vcpu->arch.apic_base, apic->base_address); } @@ -2021,17 +2073,13 @@ nomem: int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; + u32 ppr; if (!apic_enabled(apic)) return -1; - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; + __apic_update_ppr(apic, &ppr); + return apic_has_interrupt_for_ppr(apic, ppr); } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) @@ -2067,6 +2115,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + u32 ppr; if (vector == -1) return -1; @@ -2078,13 +2127,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) * because the process would deliver it through the IDT. */ - apic_set_isr(vector, apic); - apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); + /* + * For auto-EOI interrupts, there might be another pending + * interrupt above PPR, so check whether to raise another + * KVM_REQ_EVENT. + */ apic_update_ppr(apic); + } else { + /* + * For normal interrupts, PPR has been raised and there cannot + * be a higher-priority pending interrupt---except if there was + * a concurrent interrupt injection, but that would have + * triggered KVM_REQ_EVENT already. + */ + apic_set_isr(vector, apic); + __apic_update_ppr(apic, &ppr); } return vector; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ff8039d61672..05abd837b78a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -73,6 +73,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -203,17 +204,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline u32 kvm_apic_id(struct kvm_lapic *apic) -{ - /* To avoid a race between apic_base and following APIC_ID update when - * switching to x2apic_mode, the x2apic mode returns initial x2apic id. - */ - if (apic_x2apic_mode(apic)) - return apic->vcpu->vcpu_id; - - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7012de4a1fed..64821ca3a7c3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,8 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/hash.h> +#include <linux/kern_levels.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -129,6 +131,10 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -178,6 +184,25 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +/* + * The mask/value to distinguish a PTE that has been marked not-present for + * access tracking purposes. + * The mask would be either 0 if access tracking is disabled, or + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + */ +static u64 __read_mostly shadow_acc_track_mask; +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -187,6 +212,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool is_access_track_spte(u64 spte) +{ + /* Always false if shadow_acc_track_mask is zero. */ + return (spte & shadow_acc_track_mask) == shadow_acc_track_value; +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. The concept is @@ -284,7 +315,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -292,9 +324,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +void kvm_mmu_clear_all_pte_masks(void) +{ + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_mmio_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; +} + static int is_cpuid_PSE36(void) { return 1; @@ -307,7 +353,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -473,7 +519,7 @@ retry: } #endif -static bool spte_is_locklessly_modifiable(u64 spte) +static bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); @@ -481,36 +527,38 @@ static bool spte_is_locklessly_modifiable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { + if (!is_shadow_present_pte(spte)) + return false; + /* * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_is_locklessly_modifiable(spte)) + if (spte_can_locklessly_be_made_writable(spte) || + is_access_track_spte(spte)) return true; - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; + if (shadow_accessed_mask) { + if ((spte & shadow_accessed_mask) == 0 || + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + return true; + } - return true; + return false; } -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_accessed_spte(u64 spte) { - return (old_spte & bit_mask) && !(new_spte & bit_mask); + return shadow_accessed_mask ? spte & shadow_accessed_mask + : !is_access_track_spte(spte); } -static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_dirty_spte(u64 spte) { - return (old_spte & bit_mask) != (new_spte & bit_mask); + return shadow_dirty_mask ? spte & shadow_dirty_mask + : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -525,25 +573,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. +/* + * Update the SPTE (excluding the PFN), but do not track changes in its + * accessed/dirty status. */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) +static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool ret = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return ret; + return old_spte; } if (!spte_has_volatile_bits(old_spte)) @@ -551,45 +593,62 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + + return old_spte; +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. + * + * Returns true if the TLB needs to be flushed + */ +static bool mmu_spte_update(u64 *sptep, u64 new_spte) +{ + bool flush = false; + u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); + + if (!is_shadow_present_pte(old_spte)) + return false; + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_is_locklessly_modifiable(old_spte) && + if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) - ret = true; - - if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; - } + flush = true; /* - * Flush TLB when accessed/dirty bits are changed in the page tables, + * Flush TLB when accessed/dirty states are changed in the page tables, * to guarantee consistency between TLB and page tables. */ - if (spte_is_bit_changed(old_spte, new_spte, - shadow_accessed_mask | shadow_dirty_mask)) - ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { + flush = true; kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + } + + if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { + flush = true; kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + } - return ret; + return flush; } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. + * Returns non-zero if the PTE was previously valid. */ static int mmu_spte_clear_track_bits(u64 *sptep) { @@ -613,11 +672,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : - PT_WRITABLE_MASK)) + + if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); + return 1; } @@ -636,6 +696,61 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } +static u64 mark_spte_for_access_track(u64 spte) +{ + if (shadow_accessed_mask != 0) + return spte & ~shadow_accessed_mask; + + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + return spte; + + /* + * Verify that the write-protection that we do below will be fixable + * via the fast page fault path. Currently, that is always the case, at + * least when using EPT (which is when access tracking would be used). + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + spte |= shadow_acc_track_value; + + return spte; +} + +/* Returns the Accessed status of the PTE and resets it at the same time. */ +static bool mmu_spte_age(u64 *sptep) +{ + u64 spte = mmu_spte_get_lockless(sptep); + + if (!is_accessed_spte(spte)) + return false; + + if (shadow_accessed_mask) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + + return true; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { /* @@ -1212,7 +1327,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_is_locklessly_modifiable(spte))) + !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); @@ -1420,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + sptep, *sptep, gfn, level); need_flush = 1; @@ -1433,7 +1548,8 @@ restart: new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; + + new_spte = mark_spte_for_access_track(new_spte); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1595,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator uninitialized_var(iter); int young = 0; - BUG_ON(!shadow_accessed_mask); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + young |= mmu_spte_age(sptep); trace_kvm_age_page(gfn, level, slot, young); return young; @@ -1615,24 +1724,20 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int young = 0; /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. + * If there's no access bit in the secondary pte set by the hardware and + * fast access tracking is also not enabled, it's up to gup-fast/gup to + * set the access bit in the primary pte or in the page structure. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - break; - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + if (is_accessed_spte(*sptep)) + return 1; out: - return young; + return 0; } #define RMAP_RECYCLE_THRESHOLD 1000 @@ -1660,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -1713,7 +1818,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, @@ -1904,17 +2009,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_valid_sp() has skipped that kind of pages. + * for_each_valid_sp() has skipped that kind of pages. */ -#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ - || (_sp)->role.invalid) {} else + if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct) {} else + for_each_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2116,6 +2221,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; bool need_sync = false; bool flush = false; + int collisions = 0; LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; @@ -2130,7 +2236,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { + for_each_valid_sp(vcpu->kvm, sp, gfn) { + if (sp->gfn != gfn) { + collisions++; + continue; + } + if (!need_sync && sp->unsync) need_sync = true; @@ -2153,7 +2264,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); - return sp; + goto out; } ++vcpu->kvm->stat.mmu_cache_miss; @@ -2183,6 +2294,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); +out: + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) + vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } @@ -2583,6 +2697,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_dirty_mask; } + if (speculative) + spte = mark_spte_for_access_track(spte); + set_pte: if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2636,7 +2753,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2869,31 +2986,56 @@ static bool page_fault_can_be_fast(u32 error_code) if (unlikely(error_code & PFERR_RSVD_MASK)) return false; + /* See if the page fault is due to an NX violation */ + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + return false; + /* - * #PF can be fast only if the shadow page table is present and it - * is caused by write-protect, that means we just need change the - * W bit of the spte which can be done out of mmu-lock. + * #PF can be fast if: + * 1. The shadow page table entry is not present, which could mean that + * the fault is potentially caused by access tracking (if enabled). + * 2. The shadow page table entry is present and the fault + * is caused by write-protect, that means we just need change the W + * bit of the spte which can be done out of mmu-lock. + * + * However, if access tracking is disabled we know that a non-present + * page must be a genuine page fault where we have to create a new SPTE. + * So, if access tracking is disabled, we return true only for write + * accesses to a present page. */ - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) - return false; - return true; + return shadow_acc_track_mask != 0 || + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); } +/* + * Returns true if the SPTE was fixed successfully. Otherwise, + * someone else modified the SPTE from its original value. + */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 spte) + u64 *sptep, u64 old_spte, + bool remove_write_prot, bool remove_acc_track) { gfn_t gfn; + u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - /* - * The gfn of direct spte is stable since it is calculated - * by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + if (remove_acc_track) { + u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + } + + if (remove_write_prot) + new_spte |= PT_WRITABLE_MASK; /* * Theoretically we could also set dirty bit (and flush TLB) here in @@ -2907,8 +3049,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + return false; + + if (remove_write_prot) { + /* + * The gfn of direct spte is stable since it is + * calculated by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); kvm_vcpu_mark_page_dirty(vcpu, gfn); + } return true; } @@ -2923,8 +3074,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool ret = false; + bool fault_handled = false; u64 spte = 0ull; + uint retry_count = 0; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return false; @@ -2937,62 +3089,99 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, if (!is_shadow_present_pte(spte) || iterator.level < level) break; - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - ret = true; - goto exit; - } + do { + bool remove_write_prot = false; + bool remove_acc_track; - sp = page_header(__pa(iterator.sptep)); - if (!is_last_spte(spte, sp->role.level)) - goto exit; + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) + break; - /* - * Check if it is a spurious fault caused by TLB lazily flushed. - * - * Need not check the access of upper level table entries since - * they are always ACC_ALL. - */ - if (is_writable_pte(spte)) { - ret = true; - goto exit; - } + /* + * Check whether the memory access that caused the fault would + * still cause it if it were to be performed right now. If not, + * then this is a spurious fault caused by TLB lazily flushed, + * or some other CPU has already fixed the PTE after the + * current CPU took the fault. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ - /* - * Currently, to simplify the code, only the spte write-protected - * by dirty-log can be fast fixed. - */ - if (!spte_is_locklessly_modifiable(spte)) - goto exit; + if (error_code & PFERR_FETCH_MASK) { + if ((spte & (shadow_x_mask | shadow_nx_mask)) + == shadow_x_mask) { + fault_handled = true; + break; + } + } else if (error_code & PFERR_WRITE_MASK) { + if (is_writable_pte(spte)) { + fault_handled = true; + break; + } - /* - * Do not fix write-permission on the large spte since we only dirty - * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() - * that means other pages are missed if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte to - * fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - goto exit; + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging. + */ + remove_write_prot = + spte_can_locklessly_be_made_writable(spte); + } else { + /* Fault was on Read access */ + if (spte & PT_PRESENT_MASK) { + fault_handled = true; + break; + } + } + + remove_acc_track = is_access_track_spte(spte); + + /* Verify that the fault can be handled in the fast path */ + if (!remove_acc_track && !remove_write_prot) + break; + + /* + * Do not fix write-permission on the large spte since we only + * dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte() that means other pages are missed + * if its slot is dirty-logged. + * + * Instead, we let the slow page fault path create a normal spte + * to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) + break; + + /* + * Currently, fast page fault only works for direct mapping + * since the gfn is not stable for indirect shadow page. See + * Documentation/virtual/kvm/locking.txt to get more detail. + */ + fault_handled = fast_pf_fix_direct_spte(vcpu, sp, + iterator.sptep, spte, + remove_write_prot, + remove_acc_track); + if (fault_handled) + break; + + if (++retry_count > 4) { + printk_once(KERN_WARNING + "kvm: Fast #PF retrying more than 4 times.\n"); + break; + } + + spte = mmu_spte_get_lockless(iterator.sptep); + + } while (true); - /* - * Currently, fast page fault only works for direct mapping since - * the gfn is not stable for indirect shadow page. - * See Documentation/virtual/kvm/locking.txt to get more detail. - */ - ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); -exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); + spte, fault_handled); walk_shadow_page_lockless_end(vcpu); - return ret; + return fault_handled; } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, @@ -5063,6 +5252,8 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + kvm_mmu_clear_all_pte_masks(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 08a4d3ab3455..d0414f054bdf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4182,6 +4182,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a236decb81e4..4e691035a32d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5236,10 +5236,10 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0x3ull << 62) is set to quickly identify mmio - * spte. + * Also, special bit (62) is set to quickly identify mmio spte. */ - kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); + kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK | + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6152,7 +6152,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_update_ppr(vcpu); return 1; } @@ -6374,14 +6374,20 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* it is a read fault? */ - error_code = (exit_qualification << 2) & PFERR_USER_MASK; - /* it is a write fault? */ - error_code |= exit_qualification & PFERR_WRITE_MASK; - /* It is a fetch fault? */ - error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; - /* ept page table is present? */ - error_code |= (exit_qualification & 0x38) != 0; + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | + EPT_VIOLATION_EXECUTABLE)) + ? PFERR_PRESENT_MASK : 0; vcpu->arch.exit_qualification = exit_qualification; @@ -6572,6 +6578,19 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } +void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6697,16 +6716,9 @@ static __init int hardware_setup(void) /* SELF-IPI */ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) { - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? - 0ull : VMX_EPT_READABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else + if (enable_ept) + vmx_enable_tdp(); + else kvm_disable_tdp(); update_ple_window_actual_max(); @@ -7168,9 +7180,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) - return 1; - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7182,6 +7191,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57d8a856cdc5..6e2c71ea0627 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "req_event", VCPU_STAT(req_event) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -190,6 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, { NULL } }; @@ -3896,7 +3899,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); - kvm->arch.irqchip_split = true; + kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: @@ -3959,40 +3962,41 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - mutex_lock(&kvm->lock); + r = -EEXIST; - if (kvm->arch.vpic) + if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - kvm_destroy_pic(vpic); - mutex_unlock(&kvm->slots_lock); - goto create_irqchip_unlock; - } - } else + + r = kvm_pic_init(kvm); + if (r) + goto create_irqchip_unlock; + + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_pic_destroy(kvm); + mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; + } + r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(vpic); + kvm_pic_destroy(kvm); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } - /* Write kvm->irq_routing before kvm->arch.vpic. */ + /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); - kvm->arch.vpic = vpic; + kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -4028,7 +4032,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -4052,7 +4056,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -4461,6 +4465,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t gpa, bool write) +{ + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + if (vcpu_match_mmio_gpa(vcpu, gpa)) { + trace_vcpu_match_mmio(gva, gpa, write, true); + return 1; + } + + return 0; +} + static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) @@ -4487,16 +4506,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if (*gpa == UNMAPPED_GVA) return -1; - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; + return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4593,6 +4603,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * If the exit was due to a NPF we may already have a GPA. + * If the GPA is present, use it to avoid the GVA to GPA table walk. + * Note, this cannot be used on string operations since string + * operation using rep will only have the initial GPA from the NPF + * occurred. + */ + if (vcpu->arch.gpa_available && + emulator_can_use_gpa(ctxt) && + vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && + (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { + gpa = exception->address; + goto mmio; + } ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5609,6 +5635,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2; + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) @@ -6024,7 +6053,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK); + PT_PRESENT_MASK, 0); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -6731,6 +6760,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index cddd5d06e1cb..3603556fa0d9 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -280,6 +280,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ |