diff options
-rw-r--r-- | Documentation/virtual/kvm/hypercalls.txt | 35 | ||||
-rw-r--r-- | arch/x86/include/asm/vmx.h | 12 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/kvm_para.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 148 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 33 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 69 | ||||
-rw-r--r-- | include/uapi/linux/kvm_para.h | 2 |
8 files changed, 208 insertions, 106 deletions
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index c8d040e27046..feaaa634f154 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) is used in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to synchronize host and guest clocks. +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_offset" structure. + +a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock. + * tsc: guest TSC value used to calculate sec/nsec pair + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant. + +Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, +or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a22a4790f1ac..cc54b7026567 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -510,15 +510,15 @@ struct vmx_msr_entry { /* * Exit Qualifications for EPT Violations */ -#define EPT_VIOLATION_READ_BIT 0 -#define EPT_VIOLATION_WRITE_BIT 1 -#define EPT_VIOLATION_INSTR_BIT 2 +#define EPT_VIOLATION_ACC_READ_BIT 0 +#define EPT_VIOLATION_ACC_WRITE_BIT 1 +#define EPT_VIOLATION_ACC_INSTR_BIT 2 #define EPT_VIOLATION_READABLE_BIT 3 #define EPT_VIOLATION_WRITABLE_BIT 4 #define EPT_VIOLATION_EXECUTABLE_BIT 5 -#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT) -#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT) -#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT) +#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) +#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) +#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 1421a6585126..cff0bb6556f8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -50,6 +50,15 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; +}; + #define KVM_STEAL_ALIGNMENT_BITS 5 #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 09c2ac741567..c0e2036217ad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -861,12 +861,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); - /* - * Perfmon not yet supported for L2 guest. - */ - if (is_guest_mode(vcpu) && function == 0xa) - best = NULL; - if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 64821ca3a7c3..2fd7586aad4d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -208,7 +208,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { - shadow_mmio_mask = mmio_mask; + shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); @@ -318,6 +318,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, u64 acc_track_mask) { + if (acc_track_mask != 0) + acc_track_mask |= SPTE_SPECIAL_MASK; + shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; @@ -370,6 +373,11 @@ static int is_last_spte(u64 pte, int level) return 0; } +static bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -705,9 +713,9 @@ static u64 mark_spte_for_access_track(u64 spte) return spte; /* - * Verify that the write-protection that we do below will be fixable - * via the fast page fault path. Currently, that is always the case, at - * least when using EPT (which is when access tracking would be used). + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. */ WARN_ONCE((spte & PT_WRITABLE_MASK) && !spte_can_locklessly_be_made_writable(spte), @@ -725,6 +733,23 @@ static u64 mark_spte_for_access_track(u64 spte) return spte; } +/* Restore an acc-track PTE back to a regular PTE */ +static u64 restore_acc_track_spte(u64 spte) +{ + u64 new_spte = spte; + u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + WARN_ON_ONCE(!is_access_track_spte(spte)); + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + + return new_spte; +} + /* Returns the Accessed status of the PTE and resets it at the same time. */ static bool mmu_spte_age(u64 *sptep) { @@ -3016,27 +3041,12 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 old_spte, - bool remove_write_prot, bool remove_acc_track) + u64 *sptep, u64 old_spte, u64 new_spte) { gfn_t gfn; - u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - if (remove_acc_track) { - u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) - & shadow_acc_track_saved_bits_mask; - - new_spte &= ~shadow_acc_track_mask; - new_spte &= ~(shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift); - new_spte |= saved_bits; - } - - if (remove_write_prot) - new_spte |= PT_WRITABLE_MASK; - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in @@ -3052,7 +3062,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (remove_write_prot) { + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { /* * The gfn of direct spte is stable since it is * calculated by sp->gfn. @@ -3064,6 +3074,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return true; } +static bool is_access_allowed(u32 fault_err_code, u64 spte) +{ + if (fault_err_code & PFERR_FETCH_MASK) + return is_executable_pte(spte); + + if (fault_err_code & PFERR_WRITE_MASK) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * Return value: * - true: let the vcpu to access on the same address again. @@ -3085,13 +3107,14 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) - if (!is_shadow_present_pte(spte) || iterator.level < level) - break; do { - bool remove_write_prot = false; - bool remove_acc_track; + u64 new_spte; + + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || + iterator.level < level) + break; sp = page_header(__pa(iterator.sptep)); if (!is_last_spte(spte, sp->role.level)) @@ -3107,52 +3130,44 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Need not check the access of upper level table entries since * they are always ACC_ALL. */ + if (is_access_allowed(error_code, spte)) { + fault_handled = true; + break; + } - if (error_code & PFERR_FETCH_MASK) { - if ((spte & (shadow_x_mask | shadow_nx_mask)) - == shadow_x_mask) { - fault_handled = true; - break; - } - } else if (error_code & PFERR_WRITE_MASK) { - if (is_writable_pte(spte)) { - fault_handled = true; - break; - } + new_spte = spte; + + if (is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte); + + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging or access tracking. + */ + if ((error_code & PFERR_WRITE_MASK) && + spte_can_locklessly_be_made_writable(spte)) + { + new_spte |= PT_WRITABLE_MASK; /* - * Currently, to simplify the code, write-protection can - * be removed in the fast path only if the SPTE was - * write-protected for dirty-logging. + * Do not fix write-permission on the large spte. Since + * we only dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte(), other pages are missed + * if its slot has dirty logging enabled. + * + * Instead, we let the slow page fault path create a + * normal spte to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). */ - remove_write_prot = - spte_can_locklessly_be_made_writable(spte); - } else { - /* Fault was on Read access */ - if (spte & PT_PRESENT_MASK) { - fault_handled = true; + if (sp->role.level > PT_PAGE_TABLE_LEVEL) break; - } } - remove_acc_track = is_access_track_spte(spte); - /* Verify that the fault can be handled in the fast path */ - if (!remove_acc_track && !remove_write_prot) - break; - - /* - * Do not fix write-permission on the large spte since we only - * dirty the first page into the dirty-bitmap in - * fast_pf_fix_direct_spte() that means other pages are missed - * if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte - * to fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) + if (new_spte == spte || + !is_access_allowed(error_code, new_spte)) break; /* @@ -3162,8 +3177,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - remove_write_prot, - remove_acc_track); + new_spte); if (fault_handled) break; @@ -3173,8 +3187,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, break; } - spte = mmu_spte_get_lockless(iterator.sptep); - } while (true); trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4e691035a32d..7c3e42623090 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4953,7 +4953,7 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4964,19 +4964,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -4987,7 +4983,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -5236,10 +5231,8 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, special bit (62) is set to quickly identify mmio spte. */ - kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK | - VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6375,13 +6368,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) trace_kvm_page_fault(gpa, exit_qualification); /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_READ) + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) ? PFERR_USER_MASK : 0; /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_WRITE) + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) ? PFERR_WRITE_MASK : 0; /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_INSTR) + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ error_code |= (exit_qualification & @@ -6585,7 +6578,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); @@ -8203,8 +8196,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: - if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return false; return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); @@ -9742,11 +9733,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; } msr_bitmap_l1 = (unsigned long *)kmap(page); - if (!msr_bitmap_l1) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } memset(msr_bitmap_l0, 0xff, PAGE_SIZE); @@ -10708,7 +10694,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e2c71ea0627..09e5d31dac98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1142,6 +1142,7 @@ struct pvclock_gtod_data { u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1165,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1626,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { @@ -1635,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -5952,9 +5988,6 @@ static void kvm_set_mmio_spte_mask(void) /* Mask the reserved physical address bits. */ mask = rsvd_bits(maxphyaddr, 51); - /* Bit 62 is always reserved for 32bit host. */ - mask |= 0x3ull << 62; - /* Set the present bit. */ mask |= 1ull; @@ -6115,6 +6148,33 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + unsigned long clock_type) +{ + struct kvm_clock_pairing clock_pairing; + struct timespec ts; + cycle_t cycle; + int ret; + + if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) + return -KVM_EOPNOTSUPP; + + if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + return -KVM_EOPNOTSUPP; + + clock_pairing.sec = ts.tv_sec; + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, + sizeof(struct kvm_clock_pairing))) + ret = -KVM_EFAULT; + + return ret; +} + /* * kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -6179,6 +6239,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; + case KVM_HC_CLOCK_PAIRING: + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; default: ret = -KVM_ENOSYS; break; diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index bf6cd7d5cac2..fed506aeff62 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -14,6 +14,7 @@ #define KVM_EFAULT EFAULT #define KVM_E2BIG E2BIG #define KVM_EPERM EPERM +#define KVM_EOPNOTSUPP 95 #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 @@ -23,6 +24,7 @@ #define KVM_HC_MIPS_GET_CLOCK_FREQ 6 #define KVM_HC_MIPS_EXIT_VM 7 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 +#define KVM_HC_CLOCK_PAIRING 9 /* * hypercalls use architecture specific |