diff options
Diffstat (limited to 'arch/x86')
78 files changed, 1746 insertions, 1136 deletions
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 55c98fdd67d2..18d15d1ce87d 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -178,7 +178,7 @@ static unsigned long get_cmdline_acpi_rsdp(void) { unsigned long addr = 0; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE char val[MAX_ADDR_LEN] = { }; int ret; diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 1b5d17a9f70d..cf1f13c82175 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -10,6 +10,7 @@ #include <asm/coco.h> #include <asm/tdx.h> #include <asm/vmx.h> +#include <asm/ia32.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/pgtable.h> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d813160b14d8..6356060caaf3 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -26,6 +26,7 @@ #include <xen/events.h> #endif +#include <asm/apic.h> #include <asm/desc.h> #include <asm/traps.h> #include <asm/vdso.h> @@ -167,7 +168,96 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) } } -/* Handles int $0x80 */ +#ifdef CONFIG_IA32_EMULATION +static __always_inline bool int80_is_external(void) +{ + const unsigned int offs = (0x80 / 32) * 0x10; + const u32 bit = BIT(0x80 % 32); + + /* The local APIC on XENPV guests is fake */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* + * If vector 0x80 is set in the APIC ISR then this is an external + * interrupt. Either from broken hardware or injected by a VMM. + * + * Note: In guest mode this is only valid for secure guests where + * the secure module fully controls the vAPIC exposed to the guest. + */ + return apic_read(APIC_ISR + offs) & bit; +} + +/** + * int80_emulation - 32-bit legacy syscall entry + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * The arguments for the INT $0x80 based syscall are on stack in the + * pt_regs structure: + * eax: system call number + * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 + */ +DEFINE_IDTENTRY_RAW(int80_emulation) +{ + int nr; + + /* Kernel does not use INT $0x80! */ + if (unlikely(!user_mode(regs))) { + irqentry_enter(regs); + instrumentation_begin(); + panic("Unexpected external interrupt 0x80\n"); + } + + /* + * Establish kernel context for instrumentation, including for + * int80_is_external() below which calls into the APIC driver. + * Identical for soft and external interrupts. + */ + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* Validate that this is a soft interrupt to the extent possible */ + if (unlikely(int80_is_external())) + panic("Unexpected external interrupt 0x80\n"); + + /* + * The low level idtentry code pushed -1 into regs::orig_ax + * and regs::ax contains the syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#else /* CONFIG_IA32_EMULATION */ + +/* Handles int $0x80 on a 32bit kernel */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { int nr = syscall_32_enter(regs); @@ -186,6 +276,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) instrumentation_end(); syscall_exit_to_user_mode(regs); } +#endif /* !CONFIG_IA32_EMULATION */ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 27c05d08558a..de94e2e84ecc 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -275,80 +275,3 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) - -/* - * 32-bit legacy system call entry. - * - * 32-bit x86 Linux system calls traditionally used the INT $0x80 - * instruction. INT $0x80 lands here. - * - * This entry point can be used by 32-bit and 64-bit programs to perform - * 32-bit system calls. Instances of INT $0x80 can be found inline in - * various programs and libraries. It is also used by the vDSO's - * __kernel_vsyscall fallback for hardware that doesn't support a faster - * entry method. Restarted 32-bit system calls also fall back to INT - * $0x80 regardless of what instruction was originally used to do the - * system call. - * - * This is considered a slow path. It is not used by most libc - * implementations on modern hardware except during process startup. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 - */ -SYM_CODE_START(entry_INT80_compat) - UNWIND_HINT_ENTRY - ENDBR - /* - * Interrupts are off on entry. - */ - ASM_CLAC /* Do this early to minimize exposure */ - ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV - - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) - */ - movl %eax, %eax - - /* switch to thread stack expects orig_ax and rdi to be pushed */ - pushq %rax /* pt_regs->orig_ax */ - - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - - /* In the Xen PV case we already run on the thread stack. */ - ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV - - movq %rsp, %rax - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp - - pushq 5*8(%rax) /* regs->ss */ - pushq 4*8(%rax) /* regs->rsp */ - pushq 3*8(%rax) /* regs->eflags */ - pushq 2*8(%rax) /* regs->cs */ - pushq 1*8(%rax) /* regs->ip */ - pushq 0*8(%rax) /* regs->orig_ax */ -.Lint80_keep_stack: - - PUSH_AND_CLEAR_REGS rax=$-ENOSYS - UNWIND_HINT_REGS - - cld - - IBRS_ENTER - UNTRAIN_RET - - movq %rsp, %rdi - call do_int80_syscall_32 - jmp swapgs_restore_regs_and_return_to_usermode -SYM_CODE_END(entry_INT80_compat) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a08f794a0e79..ce1c777227b4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4660,7 +4660,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) if (pmu->intel_cap.pebs_output_pt_available) pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; else - pmu->pmu.capabilities |= ~PERF_PMU_CAP_AUX_OUTPUT; + pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; intel_pmu_check_event_constraints(pmu->event_constraints, pmu->num_counters, diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 21556ad87f4b..8f3a4d16bb79 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -15,6 +15,7 @@ #include <linux/io.h> #include <asm/apic.h> #include <asm/desc.h> +#include <asm/e820/api.h> #include <asm/sev.h> #include <asm/ibt.h> #include <asm/hypervisor.h> @@ -286,15 +287,31 @@ static int hv_cpu_die(unsigned int cpu) static int __init hv_pci_init(void) { - int gen2vm = efi_enabled(EFI_BOOT); + bool gen2vm = efi_enabled(EFI_BOOT); /* - * For Generation-2 VM, we exit from pci_arch_init() by returning 0. - * The purpose is to suppress the harmless warning: + * A Generation-2 VM doesn't support legacy PCI/PCIe, so both + * raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() -> + * pcibios_init() doesn't call pcibios_resource_survey() -> + * e820__reserve_resources_late(); as a result, any emulated persistent + * memory of E820_TYPE_PRAM (12) via the kernel parameter + * memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be + * detected by register_e820_pmem(). Fix this by directly calling + * e820__reserve_resources_late() here: e820__reserve_resources_late() + * depends on e820__reserve_resources(), which has been called earlier + * from setup_arch(). Note: e820__reserve_resources_late() also adds + * any memory of E820_TYPE_PMEM (7) into iomem_resource, and + * acpi_nfit_register_region() -> acpi_nfit_insert_resource() -> + * region_intersects() returns REGION_INTERSECTS, so the memory of + * E820_TYPE_PMEM won't get added twice. + * + * We return 0 here so that pci_arch_init() won't print the warning: * "PCI: Fatal: No config space access function found" */ - if (gen2vm) + if (gen2vm) { + e820__reserve_resources_late(); return 0; + } /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ return 1; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index c8a7fc23f63c..f896eed4516c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -16,6 +16,9 @@ #include <asm/x86_init.h> #include <asm/cpufeature.h> #include <asm/irq_vectors.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -127,6 +130,17 @@ static inline void arch_acpi_set_proc_cap_bits(u32 *cap) if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_option_idle_override == IDLE_NOMWAIT) *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH); + + if (xen_initial_domain()) { + /* + * When Linux is running as Xen dom0, the hypervisor is the + * entity in charge of the processor power management, and so + * Xen needs to check the OS capabilities reported in the + * processor capabilities buffer matches what the hypervisor + * driver supports. + */ + xen_sanitize_proc_cap_bits(cap); + } } static inline bool acpi_has_cpu_in_madt(void) diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 5a2ae24b1204..9805629479d9 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -75,6 +75,11 @@ static inline bool ia32_enabled(void) return __ia32_enabled; } +static inline void ia32_disable(void) +{ + __ia32_enabled = false; +} + #else /* !CONFIG_IA32_EMULATION */ static inline bool ia32_enabled(void) @@ -82,6 +87,8 @@ static inline bool ia32_enabled(void) return IS_ENABLED(CONFIG_X86_32); } +static inline void ia32_disable(void) {} + #endif #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 05fd175cec7d..13639e57e1f8 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -569,6 +569,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); +#if defined(CONFIG_IA32_EMULATION) +DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation); +#endif + #ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 26b628d84594..378ed944b849 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -55,8 +55,10 @@ KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) +#if IS_ENABLED(CONFIG_HYPERV) KVM_X86_OP_OPTIONAL(flush_remote_tlbs) KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range) +#endif KVM_X86_OP(flush_tlb_gva) KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) @@ -135,6 +137,7 @@ KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); +KVM_X86_OP_OPTIONAL(get_untagged_addr) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 6c98f4bb4228..058bc636356a 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -22,7 +22,7 @@ KVM_X86_PMU_OP(get_msr) KVM_X86_PMU_OP(set_msr) KVM_X86_PMU_OP(refresh) KVM_X86_PMU_OP(init) -KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a565a2e70f30..c5e362e2116f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -133,7 +133,8 @@ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ - | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP)) + | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ + | X86_CR4_LAM_SUP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -500,8 +501,23 @@ struct kvm_pmc { u8 idx; bool is_paused; bool intr; + /* + * Base value of the PMC counter, relative to the *consumed* count in + * the associated perf_event. This value includes counter updates from + * the perf_event and emulated_count since the last time the counter + * was reprogrammed, but it is *not* the current value as seen by the + * guest or userspace. + * + * The count is relative to the associated perf_event so that KVM + * doesn't need to reprogram the perf_event every time the guest writes + * to the counter. + */ u64 counter; - u64 prev_counter; + /* + * PMC events triggered by KVM emulation that haven't been fully + * processed, i.e. haven't undergone overflow detection. + */ + u64 emulated_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; @@ -937,8 +953,10 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; +#ifdef CONFIG_KVM_HYPERV bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#endif #ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; #endif @@ -1095,6 +1113,7 @@ enum hv_tsc_page_status { HV_TSC_PAGE_BROKEN, }; +#ifdef CONFIG_KVM_HYPERV /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -1125,9 +1144,9 @@ struct kvm_hv { */ unsigned int synic_auto_eoi_used; - struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; }; +#endif struct msr_bitmap_range { u32 flags; @@ -1136,6 +1155,7 @@ struct msr_bitmap_range { unsigned long *bitmap; }; +#ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { struct mutex xen_lock; @@ -1147,6 +1167,7 @@ struct kvm_xen { struct idr evtchn_ports; unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; +#endif enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, @@ -1348,8 +1369,13 @@ struct kvm_arch { /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; +#ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; +#endif + +#ifdef CONFIG_KVM_XEN struct kvm_xen xen; +#endif bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; @@ -1442,6 +1468,7 @@ struct kvm_arch { #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; + struct hv_partition_assist_pg *hv_pa_pg; #endif /* * VM-scope maximum vCPU ID. Used to determine the size of structures @@ -1614,9 +1641,11 @@ struct kvm_x86_ops { void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); +#if IS_ENABLED(CONFIG_HYPERV) int (*flush_remote_tlbs)(struct kvm *kvm); int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); +#endif /* * Flush any TLB entries associated with the given GVA. @@ -1762,6 +1791,8 @@ struct kvm_x86_ops { * Returns vCPU specific APICv inhibit reasons */ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); + + gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); }; struct kvm_x86_nested_ops { @@ -1825,6 +1856,7 @@ static inline struct kvm *kvm_arch_alloc_vm(void) #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); +#if IS_ENABLED(CONFIG_HYPERV) #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { @@ -1836,6 +1868,15 @@ static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) } #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE +static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, + u64 nr_pages) +{ + if (!kvm_x86_ops.flush_remote_tlbs_range) + return -EOPNOTSUPP; + + return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); +} +#endif /* CONFIG_HYPERV */ #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest) diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 4d84122bd643..484f4f0131a5 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -32,10 +32,6 @@ void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); void entry_SYSRETL_compat_unsafe_stack(void); void entry_SYSRETL_compat_end(void); -void entry_INT80_compat(void); -#ifdef CONFIG_XEN_PV -void xen_entry_INT80_compat(void); -#endif #else /* !CONFIG_IA32_EMULATION */ #define entry_SYSCALL_compat NULL #define entry_SYSENTER_compat NULL diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index fd2669b1cb2d..21f9407be5d3 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -86,9 +86,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); return sys_ni_syscall(); \ } -#define __SYS_NI(abi, name) \ - SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers); - #ifdef CONFIG_X86_64 #define __X64_SYS_STUB0(name) \ __SYS_STUB0(x64, sys_##name) @@ -100,13 +97,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X64_COND_SYSCALL(name) \ __COND_SYSCALL(x64, sys_##name) -#define __X64_SYS_NI(name) \ - __SYS_NI(x64, sys_##name) #else /* CONFIG_X86_64 */ #define __X64_SYS_STUB0(name) #define __X64_SYS_STUBx(x, name, ...) #define __X64_COND_SYSCALL(name) -#define __X64_SYS_NI(name) #endif /* CONFIG_X86_64 */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -120,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, sys_##name) -#define __IA32_SYS_NI(name) \ - __SYS_NI(ia32, sys_##name) #else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #define __IA32_SYS_STUB0(name) #define __IA32_SYS_STUBx(x, name, ...) #define __IA32_COND_SYSCALL(name) -#define __IA32_SYS_NI(name) #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #ifdef CONFIG_IA32_EMULATION @@ -135,8 +126,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the * ia32 regs in the proper order for shared or "common" syscalls. As some * syscalls may not be implemented, we need to expand COND_SYSCALL in - * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this - * case as well. + * kernel/sys_ni.c to cover this case as well. */ #define __IA32_COMPAT_SYS_STUB0(name) \ __SYS_STUB0(ia32, compat_sys_##name) @@ -148,14 +138,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, compat_sys_##name) -#define __IA32_COMPAT_SYS_NI(name) \ - __SYS_NI(ia32, compat_sys_##name) - #else /* CONFIG_IA32_EMULATION */ #define __IA32_COMPAT_SYS_STUB0(name) #define __IA32_COMPAT_SYS_STUBx(x, name, ...) #define __IA32_COMPAT_COND_SYSCALL(name) -#define __IA32_COMPAT_SYS_NI(name) #endif /* CONFIG_IA32_EMULATION */ @@ -175,13 +161,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(x64, compat_sys_##name) -#define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32_ABI */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) #define __X32_COMPAT_COND_SYSCALL(name) -#define __X32_COMPAT_SYS_NI(name) #endif /* CONFIG_X86_X32_ABI */ @@ -212,17 +195,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ __IA32_COMPAT_COND_SYSCALL(name) \ __X32_COMPAT_COND_SYSCALL(name) -#define COMPAT_SYS_NI(name) \ - __IA32_COMPAT_SYS_NI(name) \ - __X32_COMPAT_SYS_NI(name) - #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) \ @@ -243,8 +221,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not * hurt, we only need to re-define it here to keep the naming congruent to - * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() - * macros to work correctly. + * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro + * to work correctly. */ #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ @@ -257,10 +235,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); __X64_COND_SYSCALL(name) \ __IA32_COND_SYSCALL(name) -#define SYS_NI(name) \ - __X64_SYS_NI(name) \ - __IA32_SYS_NI(name) - /* * For VSYSCALLS, we need to declare these three syscalls with the new diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 7048dfacc04b..a9088250770f 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -100,4 +100,13 @@ static inline void leave_lazy(enum xen_lazy_mode mode) enum xen_lazy_mode xen_get_lazy_mode(void); +#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) +void xen_sanitize_proc_cap_bits(uint32_t *buf); +#else +static inline void xen_sanitize_proc_cap_bits(uint32_t *buf) +{ + BUG(); +} +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d0918a75cb00..85a3ce2a3666 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -63,6 +63,7 @@ int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +static bool has_lapic_cpus __initdata; static bool acpi_support_online_capable; #endif @@ -233,6 +234,14 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) return 0; /* + * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure + * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID + * in x2APIC must be equal or greater than 0xff. + */ + if (has_lapic_cpus && apic_id < 0xff) + return 0; + + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size * cpus_possible_map more accurately, to permit @@ -284,6 +293,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); + has_lapic_cpus = true; return 0; } @@ -1114,10 +1124,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { - int count; - int x2count = 0; - int ret; - struct acpi_subtable_proc madt_proc[2]; + int count, x2count = 0; if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; @@ -1126,21 +1133,10 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - memset(madt_proc, 0, sizeof(madt_proc)); - madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; - madt_proc[0].handler = acpi_parse_lapic; - madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; - madt_proc[1].handler = acpi_parse_x2apic; - ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, - sizeof(struct acpi_table_madt), - madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); - if (ret < 0) { - pr_err("Error parsing LAPIC/X2APIC entries\n"); - return ret; - } - - count = madt_proc[0].count; - x2count = madt_proc[1].count; + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, + acpi_parse_lapic, MAX_LOCAL_APIC); + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, + acpi_parse_x2apic, MAX_LOCAL_APIC); } if (!count && !x2count) { pr_err("No LAPIC entries present\n"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 73be3931e4f0..aae7456ece07 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -255,6 +255,16 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) } } +static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len) +{ + unsigned long flags; + + local_irq_save(flags); + optimize_nops(instr, len); + sync_core(); + local_irq_restore(flags); +} + /* * In this context, "source" is where the instructions are placed in the * section .altinstr_replacement, for example during kernel build by the @@ -438,7 +448,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - optimize_nops(instr, a->instrlen); + optimize_nops_inplace(instr, a->instrlen); continue; } @@ -1685,8 +1695,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } else { local_irq_save(flags); memcpy(addr, opcode, len); - local_irq_restore(flags); sync_core(); + local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a7eab05e5f29..f322ebd053a9 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1320,6 +1320,9 @@ static void zenbleed_check_cpu(void *unused) void amd_check_microcode(void) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + on_each_cpu(zenbleed_check_cpu, NULL, 1); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9373ec01c5ae..13b45b9c806d 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -104,8 +104,6 @@ struct cont_desc { size_t size; }; -static u32 ucode_new_rev; - /* * Microcode patch container file is prepended to the initrd in cpio * format. See Documentation/arch/x86/microcode.rst @@ -442,12 +440,11 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) +static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size) { struct cont_desc desc = { 0 }; struct microcode_amd *mc; bool ret = false; - u32 rev, dummy; desc.cpuid_1_eax = cpuid_1_eax; @@ -457,22 +454,15 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) if (!mc) return ret; - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* * Allow application of the same revision to pick up SMT-specific * changes even if the revision of the other SMT thread is already * up-to-date. */ - if (rev > mc->hdr.patch_id) + if (old_rev > mc->hdr.patch_id) return ret; - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - ret = true; - } - - return ret; + return !__apply_microcode_amd(mc); } static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) @@ -506,9 +496,12 @@ static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpi *ret = cp; } -void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) +void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax) { struct cpio_data cp = { }; + u32 dummy; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; @@ -517,7 +510,8 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) if (!(cp.data && cp.size)) return; - early_apply_microcode(cpuid_1_eax, cp.data, cp.size); + if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size)) + native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); } static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); @@ -625,10 +619,8 @@ void reload_ucode_amd(unsigned int cpu) rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("reload patch_level=0x%08x\n", ucode_new_rev); - } + if (!__apply_microcode_amd(mc)) + pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id); } } @@ -649,8 +641,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) if (p && (p->patch_id == csig->rev)) uci->mc = p->data; - pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); - return 0; } @@ -691,8 +681,6 @@ static enum ucode_state apply_microcode_amd(int cpu) rev = mc_amd->hdr.patch_id; ret = UCODE_UPDATED; - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); - out: uci->cpu_sig.rev = rev; c->microcode = rev; @@ -935,11 +923,6 @@ struct microcode_ops * __init init_amd_microcode(void) pr_warn("AMD CPU family 0x%x not supported\n", c->x86); return NULL; } - - if (ucode_new_rev) - pr_info_once("microcode updated early to new patch_level=0x%08x\n", - ucode_new_rev); - return µcode_amd_ops; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 666d25bbc5ad..232026a239a6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -41,8 +41,6 @@ #include "internal.h" -#define DRIVER_VERSION "2.2" - static struct microcode_ops *microcode_ops; bool dis_ucode_ldr = true; @@ -77,6 +75,8 @@ static u32 final_levels[] = { 0, /* T-101 terminator */ }; +struct early_load_data early_data; + /* * Check the current patch level on this CPU. * @@ -155,9 +155,9 @@ void __init load_ucode_bsp(void) return; if (intel) - load_ucode_intel_bsp(); + load_ucode_intel_bsp(&early_data); else - load_ucode_amd_bsp(cpuid_1_eax); + load_ucode_amd_bsp(&early_data, cpuid_1_eax); } void load_ucode_ap(void) @@ -828,6 +828,11 @@ static int __init microcode_init(void) if (!microcode_ops) return -ENODEV; + pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev)); + + if (early_data.new_rev) + pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev); + microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) return PTR_ERR(microcode_pdev); @@ -846,8 +851,6 @@ static int __init microcode_init(void) cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); - pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); - return 0; out_pdev: diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 6024feb98d29..070426b9895f 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -339,16 +339,9 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci) { struct microcode_intel *mc = uci->mc; - enum ucode_state ret; - u32 cur_rev, date; + u32 cur_rev; - ret = __apply_microcode(uci, mc, &cur_rev); - if (ret == UCODE_UPDATED) { - date = mc->hdr.date; - pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n", - cur_rev, mc->hdr.rev, date & 0xffff, date >> 24, (date >> 16) & 0xff); - } - return ret; + return __apply_microcode(uci, mc, &cur_rev); } static __init bool load_builtin_intel_microcode(struct cpio_data *cp) @@ -413,13 +406,17 @@ static int __init save_builtin_microcode(void) early_initcall(save_builtin_microcode); /* Load microcode on BSP from initrd or builtin blobs */ -void __init load_ucode_intel_bsp(void) +void __init load_ucode_intel_bsp(struct early_load_data *ed) { struct ucode_cpu_info uci; + ed->old_rev = intel_get_microcode_revision(); + uci.mc = get_microcode_blob(&uci, false); if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) ucode_patch_va = UCODE_BSP_LOADED; + + ed->new_rev = uci.cpu_sig.rev; } void load_ucode_intel_ap(void) diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index f8047b12329a..21776c529fa9 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -37,6 +37,12 @@ struct microcode_ops { use_nmi : 1; }; +struct early_load_data { + u32 old_rev; + u32 new_rev; +}; + +extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; struct cpio_data find_microcode_in_initrd(const char *path); @@ -92,14 +98,14 @@ extern bool dis_ucode_ldr; extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD -void load_ucode_amd_bsp(unsigned int family); +void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family); void load_ucode_amd_ap(unsigned int family); int save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(unsigned int cpu); struct microcode_ops *init_amd_microcode(void); void exit_amd_microcode(void); #else /* CONFIG_CPU_SUP_AMD */ -static inline void load_ucode_amd_bsp(unsigned int family) { } +static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { } static inline void load_ucode_amd_ap(unsigned int family) { } static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } static inline void reload_ucode_amd(unsigned int cpu) { } @@ -108,12 +114,12 @@ static inline void exit_amd_microcode(void) { } #endif /* !CONFIG_CPU_SUP_AMD */ #ifdef CONFIG_CPU_SUP_INTEL -void load_ucode_intel_bsp(void); +void load_ucode_intel_bsp(struct early_load_data *ed); void load_ucode_intel_ap(void); void reload_ucode_intel(void); struct microcode_ops *init_intel_microcode(void); #else /* CONFIG_CPU_SUP_INTEL */ -static inline void load_ucode_intel_bsp(void) { } +static inline void load_ucode_intel_bsp(struct early_load_data *ed) { } static inline void load_ucode_intel_ap(void) { } static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e6bba12c759c..01fa06dd06b6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -262,11 +262,14 @@ static uint32_t __init ms_hyperv_platform(void) static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) { static atomic_t nmi_cpu = ATOMIC_INIT(-1); + unsigned int old_cpu, this_cpu; if (!unknown_nmi_panic) return NMI_DONE; - if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1) + old_cpu = -1; + this_cpu = raw_smp_processor_id(); + if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu)) return NMI_HANDLED; return NMI_DONE; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 086a2c3aaaa0..0f8103240fda 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -255,6 +255,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) testl $X2APIC_ENABLE, %eax jnz .Lread_apicid_msr +#ifdef CONFIG_X86_X2APIC + /* + * If system is in X2APIC mode then MMIO base might not be + * mapped causing the MMIO read below to fault. Faults can't + * be handled at that point. + */ + cmpl $0, x2apic_mode(%rip) + jz .Lread_apicid_mmio + + /* Force the AP into X2APIC mode. */ + orl $X2APIC_ENABLE, %eax + wrmsr + jmp .Lread_apicid_msr +#endif + +.Lread_apicid_mmio: /* Read the APIC ID from the fix-mapped MMIO space. */ movq apic_mmio_base(%rip), %rcx addq $APIC_ID, %rcx diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8857abc706e4..660b601f1d6c 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -121,7 +121,7 @@ static const __initconst struct idt_data def_idts[] = { static const struct idt_data ia32_idt[] __initconst = { #if defined(CONFIG_IA32_EMULATION) - SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), + SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation), #elif defined(CONFIG_X86_32) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), #endif diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 70472eebe719..c67285824e82 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1234,10 +1234,6 @@ void setup_ghcb(void) if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) return; - /* First make sure the hypervisor talks a supported protocol. */ - if (!sev_es_negotiate_protocol()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - /* * Check whether the runtime #VC exception handler is active. It uses * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). @@ -1255,6 +1251,13 @@ void setup_ghcb(void) } /* + * Make sure the hypervisor talks a supported protocol. + * This gets called only in the BSP boot phase. + */ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* * Clear the boot_ghcb. The first exception comes in before the bss * section is cleared. */ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index cacf2ede6217..23d8aaf8d9fd 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -175,9 +175,6 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); - if (setup_signal_shadow_stack(ksig)) - return -EFAULT; - if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -198,6 +195,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) return -EFAULT; } + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + /* Set up registers for signal handler */ regs->di = ksig->sig; /* In case the signal handler was declared without prototypes */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c1716e83d176..87e3da7b0439 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -23,17 +23,15 @@ config KVM depends on HAVE_KVM depends on HIGH_RES_TIMERS depends on X86_LOCAL_APIC - select PREEMPT_NOTIFIERS + select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE - select HAVE_KVM_IRQFD select HAVE_KVM_DIRTY_RING_TSO select HAVE_KVM_DIRTY_RING_ACQ_REL select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING - select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO @@ -46,7 +44,6 @@ config KVM select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO - select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING help @@ -65,13 +62,13 @@ config KVM config KVM_WERROR bool "Compile KVM with -Werror" - # KASAN may cause the build to fail due to larger frames - default y if X86_64 && !KASAN - # We use the dependency on !COMPILE_TEST to not be enabled - # blindly in allmodconfig or allyesconfig configurations - depends on KVM - depends on (X86_64 && !KASAN) || !COMPILE_TEST - depends on EXPERT + # Disallow KVM's -Werror if KASAN is enabled, e.g. to guard against + # randomized configs from selecting KVM_WERROR=y, which doesn't play + # nice with KASAN. KASAN builds generates warnings for the default + # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. + # Building KVM with -Werror and KASAN is still doable via enabling + # the kernel-wide WERROR=y. + depends on KVM && EXPERT && !KASAN help Add -Werror to the build flags for KVM. @@ -80,7 +77,7 @@ config KVM_WERROR config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT - depends on X86_64 + depends on KVM && X86_64 select KVM_GENERIC_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently "protected" @@ -141,6 +138,20 @@ config KVM_SMM If unsure, say Y. +config KVM_HYPERV + bool "Support for Microsoft Hyper-V emulation" + depends on KVM + default y + help + Provides KVM support for emulating Microsoft Hyper-V. This allows KVM + to expose a subset of the paravirtualized interfaces defined in the + Hyper-V Hypervisor Top-Level Functional Specification (TLFS): + https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs + These interfaces are required for the correct and performant functioning + of Windows and Hyper-V guests on KVM. + + If unsure, say "Y". + config KVM_XEN bool "Support for Xen hypercall interface" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 80e3fe184d17..475b5fa917a6 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -11,25 +11,27 @@ include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ + debugfs.o mmu/mmu.o mmu/page_track.o \ mmu/spte.o -ifdef CONFIG_HYPERV -kvm-y += kvm_onhyperv.o -endif - kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o +kvm-$(CONFIG_KVM_HYPERV) += hyperv.o kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ - vmx/hyperv.o vmx/nested.o vmx/posted_intr.o + vmx/nested.o vmx/posted_intr.o + kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o +kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \ - svm/sev.o svm/hyperv.o + svm/sev.o +kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o ifdef CONFIG_HYPERV +kvm-y += kvm_onhyperv.o +kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o kvm-amd-y += svm/svm_onhyperv.o endif diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dda6fc4cfae8..294e5bd5f8a0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -314,11 +314,15 @@ EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent) { +#ifdef CONFIG_KVM_HYPERV struct kvm_cpuid_entry2 *entry; entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE, KVM_CPUID_INDEX_NOT_SIGNIFICANT); return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX; +#else + return false; +#endif } static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) @@ -433,11 +437,13 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, return 0; } +#ifdef CONFIG_KVM_HYPERV if (kvm_cpuid_has_hyperv(e2, nent)) { r = kvm_hv_vcpu_init(vcpu); if (r) return r; } +#endif r = kvm_check_cpuid(vcpu, e2, nent); if (r) @@ -469,7 +475,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, return -E2BIG; if (cpuid->nent) { - e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent)); + e = vmemdup_array_user(entries, cpuid->nent, sizeof(*e)); if (IS_ERR(e)) return PTR_ERR(e); @@ -513,7 +519,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, return -E2BIG; if (cpuid->nent) { - e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent)); + e2 = vmemdup_array_user(entries, cpuid->nent, sizeof(*e2)); if (IS_ERR(e2)) return PTR_ERR(e2); } @@ -671,7 +677,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_7_1_EAX, F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(FZRM) | F(FSRS) | F(FSRC) | - F(AMX_FP16) | F(AVX_IFMA) + F(AMX_FP16) | F(AVX_IFMA) | F(LAM) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, @@ -679,6 +685,11 @@ void kvm_set_cpu_caps(void) F(AMX_COMPLEX) ); + kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, + F(INTEL_PSFD) | F(IPRED_CTRL) | F(RRSBA_CTRL) | F(DDPD_U) | + F(BHI_CTRL) | F(MCDT_NO) + ); + kvm_cpu_cap_mask(CPUID_D_1_EAX, F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); @@ -960,13 +971,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* function 7 has additional index. */ case 7: - entry->eax = min(entry->eax, 1u); + max_idx = entry->eax = min(entry->eax, 2u); cpuid_entry_override(entry, CPUID_7_0_EBX); cpuid_entry_override(entry, CPUID_7_ECX); cpuid_entry_override(entry, CPUID_7_EDX); - /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */ - if (entry->eax == 1) { + /* KVM only supports up to 0x7.2, capped above via min(). */ + if (max_idx >= 1) { entry = do_host_cpuid(array, function, 1); if (!entry) goto out; @@ -976,6 +987,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = 0; entry->ecx = 0; } + if (max_idx >= 2) { + entry = do_host_cpuid(array, function, 2); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_7_2_EDX); + entry->ecx = 0; + entry->ebx = 0; + entry->eax = 0; + } break; case 0xa: { /* Architectural Performance Monitoring */ union cpuid10_eax eax; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 0b90532b6e26..856e3037e74f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -47,11 +47,6 @@ static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return !(gpa & vcpu->arch.reserved_gpa_bits); } -static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return !kvm_vcpu_is_legal_gpa(vcpu, gpa); -} - static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t alignment) { @@ -279,4 +274,12 @@ static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, vcpu->arch.governed_features.enabled); } +static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (guest_can_use(vcpu, X86_FEATURE_LAM)) + cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57); + + return kvm_vcpu_is_legal_gpa(vcpu, cr3); +} + #endif diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 42026b3f3ff3..95ea1a1f7403 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -182,6 +182,7 @@ static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) } static const struct file_operations mmu_rmaps_stat_fops = { + .owner = THIS_MODULE, .open = kvm_mmu_rmaps_stat_open, .read = seq_read, .llseek = seq_lseek, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2673cd5c46cb..e223043ef5b2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -687,8 +687,8 @@ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned *max_size, unsigned size, - bool write, bool fetch, - enum x86emul_mode mode, ulong *linear) + enum x86emul_mode mode, ulong *linear, + unsigned int flags) { struct desc_struct desc; bool usable; @@ -701,7 +701,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: - *linear = la; + *linear = la = ctxt->ops->get_untagged_addr(ctxt, la, flags); va_bits = ctxt_virt_addr_bits(ctxt); if (!__is_canonical_address(la, va_bits)) goto bad; @@ -717,11 +717,11 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, if (!usable) goto bad; /* code segment in protected mode or read-only data segment */ - if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) - || !(desc.type & 2)) && write) + if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) || !(desc.type & 2)) && + (flags & X86EMUL_F_WRITE)) goto bad; /* unreadable code segment */ - if (!fetch && (desc.type & 8) && !(desc.type & 2)) + if (!(flags & X86EMUL_F_FETCH) && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); if (!(desc.type & 8) && (desc.type & 4)) { @@ -757,8 +757,8 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ulong *linear) { unsigned max_size; - return __linearize(ctxt, addr, &max_size, size, write, false, - ctxt->mode, linear); + return __linearize(ctxt, addr, &max_size, size, ctxt->mode, linear, + write ? X86EMUL_F_WRITE : 0); } static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) @@ -771,7 +771,8 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); - rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear); + rc = __linearize(ctxt, addr, &max_size, 1, ctxt->mode, &linear, + X86EMUL_F_FETCH); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; @@ -907,8 +908,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) * boundary check itself. Instead, we use max_size to check * against op_size. */ - rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode, - &linear); + rc = __linearize(ctxt, addr, &max_size, 0, ctxt->mode, &linear, + X86EMUL_F_FETCH); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; @@ -3439,8 +3440,10 @@ static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; ulong linear; + unsigned int max_size; - rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear); + rc = __linearize(ctxt, ctxt->src.addr.mem, &max_size, 1, ctxt->mode, + &linear, X86EMUL_F_INVLPG); if (rc == X86EMUL_CONTINUE) ctxt->ops->invlpg(ctxt, linear); /* Disable writeback. */ diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 423a73395c10..ad463b1ed4e4 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -16,6 +16,7 @@ KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) KVM_GOVERNED_X86_FEATURE(VGIF) KVM_GOVERNED_X86_FEATURE(VNMI) +KVM_GOVERNED_X86_FEATURE(LAM) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index f83b8db72b11..1dc0b6604526 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -24,6 +24,8 @@ #include <linux/kvm_host.h> #include "x86.h" +#ifdef CONFIG_KVM_HYPERV + /* "Hv#1" signature */ #define HYPERV_CPUID_SIGNATURE_EAX 0x31237648 @@ -105,6 +107,17 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); +static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector) +{ + return to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->vec_bitmap); +} + +static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector) +{ + return to_hv_vcpu(vcpu) && + test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap); +} + void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); @@ -236,6 +249,76 @@ static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) return kvm_hv_get_assist_page(vcpu); } +static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, + bool tdp_enabled) +{ + /* + * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or + * L2's VP_ID upon request from the guest. Make sure we check for + * pending entries in the right FIFO upon L1/L2 transition as these + * requests are put by other vCPUs asynchronously. + */ + if (to_hv_vcpu(vcpu) && tdp_enabled) + kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); +} + int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); +#else /* CONFIG_KVM_HYPERV */ +static inline void kvm_hv_setup_tsc_page(struct kvm *kvm, + struct pvclock_vcpu_time_info *hv_clock) {} +static inline void kvm_hv_request_tsc_page_update(struct kvm *kvm) {} +static inline void kvm_hv_init_vm(struct kvm *kvm) {} +static inline void kvm_hv_destroy_vm(struct kvm *kvm) {} +static inline int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} +static inline void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) {} +static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + return HV_STATUS_ACCESS_DENIED; +} +static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {} +static inline void kvm_hv_free_pa_page(struct kvm *kvm) {} +static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector) +{ + return false; +} +static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector) +{ + return false; +} +static inline void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) {} +static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) {} +static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) +{ + return 0; +} +static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) +{ + return vcpu->vcpu_idx; +} +static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, bool tdp_enabled) {} +#endif /* CONFIG_KVM_HYPERV */ -#endif +#endif /* __ARCH_X86_KVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b2c397dd2bc6..ad9ca8a60144 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -118,8 +118,10 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; +#ifdef CONFIG_KVM_XEN if (kvm_xen_has_interrupt(v)) return v->kvm->arch.xen.upcall_vector; +#endif if (irqchip_split(v->kvm)) { int vector = v->arch.pending_external_vector; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 16d076a1b91a..68f3f6c26046 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -144,7 +144,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } - +#ifdef CONFIG_KVM_HYPERV static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -154,6 +154,7 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); } +#endif int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -163,9 +164,11 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, int r; switch (e->type) { +#ifdef CONFIG_KVM_HYPERV case KVM_IRQ_ROUTING_HV_SINT: return kvm_hv_set_sint(e, kvm, irq_source_id, level, line_status); +#endif case KVM_IRQ_ROUTING_MSI: if (kvm_msi_route_invalid(kvm, e)) @@ -314,11 +317,13 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_msi_route_invalid(kvm, e)) return -EINVAL; break; +#ifdef CONFIG_KVM_HYPERV case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; e->hv_sint.vcpu = ue->u.hv_sint.vcpu; e->hv_sint.sint = ue->u.hv_sint.sint; break; +#endif #ifdef CONFIG_KVM_XEN case KVM_IRQ_ROUTING_XEN_EVTCHN: return kvm_xen_setup_evtchn(kvm, e, ue); @@ -438,5 +443,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, void kvm_arch_irq_routing_update(struct kvm *kvm) { +#ifdef CONFIG_KVM_HYPERV kvm_hv_irq_routing_update(kvm); +#endif } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index be7aeb9b8ea3..e6d149825169 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -88,6 +88,12 @@ struct x86_instruction_info { #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ +/* x86-specific emulation flags */ +#define X86EMUL_F_WRITE BIT(0) +#define X86EMUL_F_FETCH BIT(1) +#define X86EMUL_F_IMPLICIT BIT(2) +#define X86EMUL_F_INVLPG BIT(3) + struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* @@ -224,6 +230,9 @@ struct x86_emulate_ops { int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); + + gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, + unsigned int flags); }; /* Type, address-of, and value of an instruction's operand. */ diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index f9ca3e7432b2..eefab3dc8498 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -10,6 +10,26 @@ int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); int hv_flush_remote_tlbs(struct kvm *kvm); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); +static inline hpa_t hv_get_partition_assist_page(struct kvm_vcpu *vcpu) +{ + /* + * Partition assist page is something which Hyper-V running in L0 + * requires from KVM running in L1 before direct TLB flush for L2 + * guests can be enabled. KVM doesn't currently use the page but to + * comply with TLFS it still needs to be allocated. For now, this + * is a single page shared among all vCPUs. + */ + struct hv_partition_assist_pg **p_hv_pa_pg = + &vcpu->kvm->arch.hv_pa_pg; + + if (!*p_hv_pa_pg) + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); + + if (!*p_hv_pa_pg) + return INVALID_PAGE; + + return __pa(*p_hv_pa_pg); +} #else /* !CONFIG_HYPERV */ static inline int hv_flush_remote_tlbs(struct kvm *kvm) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 245b20973cae..3242f3da2457 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1475,8 +1475,7 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (to_hv_vcpu(apic->vcpu) && - test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap)) + if (kvm_hv_synic_has_vector(apic->vcpu, vector)) kvm_hv_synic_send_eoi(apic->vcpu, vector); kvm_ioapic_send_eoi(apic, vector); @@ -2905,7 +2904,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) */ apic_clear_irr(vector, apic); - if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) { + if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) { /* * For auto-EOI interrupts, there might be another pending * interrupt above PPR, so check whether to raise another diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bb8c86eefac0..60f21bb4c27b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -146,6 +146,14 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); } +static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu) +{ + if (!guest_can_use(vcpu, X86_FEATURE_LAM)) + return 0; + + return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57); +} + static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) { u64 root_hpa = vcpu->arch.mmu->root.hpa; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 59b026b6ad2a..d0590b417d30 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -271,15 +271,11 @@ static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, static inline bool kvm_available_flush_remote_tlbs_range(void) { +#if IS_ENABLED(CONFIG_HYPERV) return kvm_x86_ops.flush_remote_tlbs_range; -} - -int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) -{ - if (!kvm_x86_ops.flush_remote_tlbs_range) - return -EOPNOTSUPP; - - return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); +#else + return false; +#endif } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); @@ -3806,7 +3802,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) hpa_t root; root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); - root_gfn = root_pgd >> PAGE_SHIFT; + root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT; if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { mmu->root.hpa = kvm_mmu_get_dummy_root(); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index b66a7d47e0e4..0669a8a668ca 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -13,6 +13,7 @@ #endif /* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ +#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12) #define __PT_LEVEL_SHIFT(level, bits_per_level) \ (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) #define __PT_INDEX(address, level, bits_per_level) \ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index c85255073f67..4d4e98fe4f35 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -62,7 +62,7 @@ #endif /* Common logic, but per-type values. These also need to be undefined. */ -#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) +#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK) #define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) #define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) #define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 9ae07db6f0f6..87cc6c8809ad 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -127,9 +127,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; /* - * Ignore overflow events for counters that are scheduled to be - * reprogrammed, e.g. if a PMI for the previous event races with KVM's - * handling of a related guest WRMSR. + * Ignore asynchronous overflow events for counters that are scheduled + * to be reprogrammed, e.g. if a PMI for the previous event races with + * KVM's handling of a related guest WRMSR. */ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) return; @@ -161,6 +161,15 @@ static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc) return 1; } +static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) +{ + u64 sample_period = (-counter_value) & pmc_bitmask(pmc); + + if (!sample_period) + sample_period = pmc_bitmask(pmc) + 1; + return sample_period; +} + static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) @@ -215,17 +224,30 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, return 0; } -static void pmc_pause_counter(struct kvm_pmc *pmc) +static bool pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - - if (!pmc->perf_event || pmc->is_paused) - return; + u64 prev_counter; /* update counter, reset event value to avoid redundant accumulation */ - counter += perf_event_pause(pmc->perf_event, true); + if (pmc->perf_event && !pmc->is_paused) + counter += perf_event_pause(pmc->perf_event, true); + + /* + * Snapshot the previous counter *after* accumulating state from perf. + * If overflow already happened, hardware (via perf) is responsible for + * generating a PMI. KVM just needs to detect overflow on emulated + * counter events that haven't yet been processed. + */ + prev_counter = counter & pmc_bitmask(pmc); + + counter += pmc->emulated_counter; pmc->counter = counter & pmc_bitmask(pmc); + + pmc->emulated_counter = 0; pmc->is_paused = true; + + return pmc->counter < prev_counter; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -250,6 +272,51 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } +static void pmc_release_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + pmc->current_config = 0; + pmc_to_pmu(pmc)->event_count--; + } +} + +static void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + pmc_release_perf_event(pmc); + } +} + +static void pmc_update_sample_period(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event || pmc->is_paused || + !is_sampling_event(pmc->perf_event)) + return; + + perf_event_period(pmc->perf_event, + get_sample_period(pmc, pmc->counter)); +} + +void pmc_write_counter(struct kvm_pmc *pmc, u64 val) +{ + /* + * Drop any unconsumed accumulated counts, the WRMSR is a write, not a + * read-modify-write. Adjust the counter value so that its value is + * relative to the current count, as reading the current count from + * perf is faster than pausing and repgrogramming the event in order to + * reset it to '0'. Note, this very sneakily offsets the accumulated + * emulated count too, by using pmc_read_counter()! + */ + pmc->emulated_counter = 0; + pmc->counter += val - pmc_read_counter(pmc); + pmc->counter &= pmc_bitmask(pmc); + pmc_update_sample_period(pmc); +} +EXPORT_SYMBOL_GPL(pmc_write_counter); + static int filter_cmp(const void *pa, const void *pb, u64 mask) { u64 a = *(u64 *)pa & mask; @@ -383,14 +450,15 @@ static void reprogram_counter(struct kvm_pmc *pmc) struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; u64 new_config = eventsel; + bool emulate_overflow; u8 fixed_ctr_ctrl; - pmc_pause_counter(pmc); + emulate_overflow = pmc_pause_counter(pmc); if (!pmc_event_is_allowed(pmc)) goto reprogram_complete; - if (pmc->counter < pmc->prev_counter) + if (emulate_overflow) __kvm_perf_overflow(pmc, false); if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) @@ -430,7 +498,6 @@ static void reprogram_counter(struct kvm_pmc *pmc) reprogram_complete: clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); - pmc->prev_counter = 0; } void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) @@ -639,32 +706,60 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -/* refresh PMU settings. This function generally is called when underlying - * settings are changed (such as changes of PMU CPUID by guest VMs), which - * should rarely happen. +static void kvm_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + pmu->need_cleanup = false; + + bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX); + + for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); + if (!pmc) + continue; + + pmc_stop_counter(pmc); + pmc->counter = 0; + pmc->emulated_counter = 0; + + if (pmc_is_gp(pmc)) + pmc->eventsel = 0; + } + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; + + static_call_cond(kvm_x86_pmu_reset)(vcpu); +} + + +/* + * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID + * and/or PERF_CAPABILITIES. */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) return; + /* + * Stop/release all existing counters/events before realizing the new + * vPMU model. + */ + kvm_pmu_reset(vcpu); + bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); static_call(kvm_x86_pmu_refresh)(vcpu); } -void kvm_pmu_reset(struct kvm_vcpu *vcpu) -{ - static_call(kvm_x86_pmu_reset)(vcpu); -} - void kvm_pmu_init(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); static_call(kvm_x86_pmu_init)(vcpu); - pmu->event_count = 0; - pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); } @@ -700,8 +795,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - pmc->prev_counter = pmc->counter; - pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + pmc->emulated_counter++; kvm_pmu_request_counter_reprogram(pmc); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 1d64113de488..7caeb3d8d4fd 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -66,7 +66,8 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) { u64 counter, enabled, running; - counter = pmc->counter; + counter = pmc->counter + pmc->emulated_counter; + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); @@ -74,29 +75,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } -static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) -{ - pmc->counter += val - pmc_read_counter(pmc); - pmc->counter &= pmc_bitmask(pmc); -} - -static inline void pmc_release_perf_event(struct kvm_pmc *pmc) -{ - if (pmc->perf_event) { - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - pmc->current_config = 0; - pmc_to_pmu(pmc)->event_count--; - } -} - -static inline void pmc_stop_counter(struct kvm_pmc *pmc) -{ - if (pmc->perf_event) { - pmc->counter = pmc_read_counter(pmc); - pmc_release_perf_event(pmc); - } -} +void pmc_write_counter(struct kvm_pmc *pmc, u64 val); static inline bool pmc_is_gp(struct kvm_pmc *pmc) { @@ -146,25 +125,6 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) return NULL; } -static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) -{ - u64 sample_period = (-counter_value) & pmc_bitmask(pmc); - - if (!sample_period) - sample_period = pmc_bitmask(pmc) + 1; - return sample_period; -} - -static inline void pmc_update_sample_period(struct kvm_pmc *pmc) -{ - if (!pmc->perf_event || pmc->is_paused || - !is_sampling_event(pmc->perf_event)) - return; - - perf_event_period(pmc->perf_event, - get_sample_period(pmc, pmc->counter)); -} - static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -261,7 +221,6 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); -void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index b81650678375..aadefcaa9561 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -16,6 +16,7 @@ enum kvm_only_cpuid_leafs { CPUID_7_1_EDX, CPUID_8000_0007_EDX, CPUID_8000_0022_EAX, + CPUID_7_2_EDX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -46,6 +47,14 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ +#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) +#define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1) +#define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2) +#define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) +#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) +#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) + /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -80,6 +89,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, + [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, }; /* @@ -106,18 +116,19 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) */ static __always_inline u32 __feature_translate(int x86_feature) { - if (x86_feature == X86_FEATURE_SGX1) - return KVM_X86_FEATURE_SGX1; - else if (x86_feature == X86_FEATURE_SGX2) - return KVM_X86_FEATURE_SGX2; - else if (x86_feature == X86_FEATURE_SGX_EDECCSSA) - return KVM_X86_FEATURE_SGX_EDECCSSA; - else if (x86_feature == X86_FEATURE_CONSTANT_TSC) - return KVM_X86_FEATURE_CONSTANT_TSC; - else if (x86_feature == X86_FEATURE_PERFMON_V2) - return KVM_X86_FEATURE_PERFMON_V2; - - return x86_feature; +#define KVM_X86_TRANSLATE_FEATURE(f) \ + case X86_FEATURE_##f: return KVM_X86_FEATURE_##f + + switch (x86_feature) { + KVM_X86_TRANSLATE_FEATURE(SGX1); + KVM_X86_TRANSLATE_FEATURE(SGX2); + KVM_X86_TRANSLATE_FEATURE(SGX_EDECCSSA); + KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC); + KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); + KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); + default: + return x86_feature; + } } static __always_inline u32 __feature_leaf(int x86_feature) diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h index 02f4784b5d44..d3f8bfc05832 100644 --- a/arch/x86/kvm/svm/hyperv.h +++ b/arch/x86/kvm/svm/hyperv.h @@ -11,6 +11,7 @@ #include "../hyperv.h" #include "svm.h" +#ifdef CONFIG_KVM_HYPERV static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -41,5 +42,13 @@ static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) } void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); +#else /* CONFIG_KVM_HYPERV */ +static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) {} +static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) {} +#endif /* CONFIG_KVM_HYPERV */ #endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 60891b9ce25f..dee62362a360 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -187,7 +187,6 @@ void recalc_intercepts(struct vcpu_svm *svm) */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { - struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; int i; /* @@ -198,11 +197,16 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) * - Nested hypervisor (L1) is using Hyper-V emulation interface and * tells KVM (L0) there were no changes in MSR bitmap for L2. */ - if (!svm->nested.force_msr_bitmap_recalc && - kvm_hv_hypercall_enabled(&svm->vcpu) && - hve->hv_enlightenments_control.msr_bitmap && - (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) - goto set_msrpm_base_pa; +#ifdef CONFIG_KVM_HYPERV + if (!svm->nested.force_msr_bitmap_recalc) { + struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; + + if (kvm_hv_hypercall_enabled(&svm->vcpu) && + hve->hv_enlightenments_control.msr_bitmap && + (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) + goto set_msrpm_base_pa; + } +#endif if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; @@ -230,7 +234,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.force_msr_bitmap_recalc = false; +#ifdef CONFIG_KVM_HYPERV set_msrpm_base_pa: +#endif svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; @@ -296,7 +302,7 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { if (CC(!(save->cr4 & X86_CR4_PAE)) || CC(!(save->cr0 & X86_CR0_PE)) || - CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) + CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3))) return false; } @@ -363,12 +369,14 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->msrpm_base_pa &= ~0x0fffULL; to->iopm_base_pa &= ~0x0fffULL; +#ifdef CONFIG_KVM_HYPERV /* Hyper-V extensions (Enlightened VMCB) */ if (kvm_hv_hypercall_enabled(vcpu)) { to->clean = from->clean; memcpy(&to->hv_enlightenments, &from->hv_enlightenments, sizeof(to->hv_enlightenments)); } +#endif } void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, @@ -472,14 +480,8 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) { - /* - * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or - * L2's VP_ID upon request from the guest. Make sure we check for - * pending entries in the right FIFO upon L1/L2 transition as these - * requests are put by other vCPUs asynchronously. - */ - if (to_hv_vcpu(vcpu) && npt_enabled) - kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + /* Handle pending Hyper-V TLB flush requests */ + kvm_hv_nested_transtion_tlb_flush(vcpu, npt_enabled); /* * TODO: optimize unconditional TLB flush/MMU sync. A partial list of @@ -505,7 +507,7 @@ static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt, bool reload_pdptrs) { - if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) + if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) return -EINVAL; if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 373ff6a6687b..b6a7ad4d6914 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -161,7 +161,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { pmc_write_counter(pmc, data); - pmc_update_sample_period(pmc); return 0; } /* MSR_EVNTSELn */ @@ -233,21 +232,6 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) } } -static void amd_pmu_reset(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int i; - - for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; - - pmc_stop_counter(pmc); - pmc->counter = pmc->prev_counter = pmc->eventsel = 0; - } - - pmu->global_ctrl = pmu->global_status = 0; -} - struct kvm_pmu_ops amd_pmu_ops __initdata = { .hw_event_available = amd_hw_event_available, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, @@ -259,7 +243,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .set_msr = amd_pmu_set_msr, .refresh = amd_pmu_refresh, .init = amd_pmu_init, - .reset = amd_pmu_reset, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d0c580607f00..f760106c31f8 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2975,6 +2975,25 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); } + + /* + * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if + * the host/guest supports its use. + * + * guest_can_use() checks a number of requirements on the host/guest to + * ensure that MSR_IA32_XSS is available, but it might report true even + * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host + * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better + * to further check that the guest CPUID actually supports + * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved + * guests will still get intercepted and caught in the normal + * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths. + */ + if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); + else + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0); } void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c46f07b28230..2171b0cda8d4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -103,6 +103,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, { .index = MSR_IA32_LASTINTTOIP, .always = false }, + { .index = MSR_IA32_XSS, .always = false }, { .index = MSR_EFER, .always = false }, { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, @@ -1855,15 +1856,17 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) bool old_paging = is_paging(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { vcpu->arch.efer |= EFER_LMA; - svm->vmcb->save.efer |= EFER_LMA | EFER_LME; + if (!vcpu->arch.guest_state_protected) + svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { vcpu->arch.efer &= ~EFER_LMA; - svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); + if (!vcpu->arch.guest_state_protected) + svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index be67ab7fdd10..8ef95139cd24 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -30,7 +30,7 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 46 +#define MAX_DIRECT_ACCESS_MSRS 47 #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -148,7 +148,9 @@ struct vmcb_ctrl_area_cached { u64 virt_ext; u32 clean; union { +#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; +#endif u8 reserved_sw[32]; }; }; diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 7af8422d3382..3971b3ea5d04 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -18,18 +18,14 @@ int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_vmcb_enlightenments *hve; - struct hv_partition_assist_pg **p_hv_pa_pg = - &to_kvm_hv(vcpu->kvm)->hv_pa_pg; + hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); - if (!*p_hv_pa_pg) - *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); - - if (!*p_hv_pa_pg) + if (partition_assist_page == INVALID_PAGE) return -ENOMEM; hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments; - hve->partition_assist_page = __pa(*p_hv_pa_pg); + hve->partition_assist_page = partition_assist_page; hve->hv_vm_id = (unsigned long)vcpu->kvm; if (!hve->hv_enlightenments_control.nested_flush_hypercall) { hve->hv_enlightenments_control.nested_flush_hypercall = 1; diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index ef2ebabb059c..9499f9c6b077 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -270,16 +270,16 @@ SYM_FUNC_START(__svm_vcpu_run) RESTORE_GUEST_SPEC_CTRL_BODY RESTORE_HOST_SPEC_CTRL_BODY -10: cmpb $0, kvm_rebooting +10: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b ud2 -30: cmpb $0, kvm_rebooting +30: cmpb $0, _ASM_RIP(kvm_rebooting) jne 4b ud2 -50: cmpb $0, kvm_rebooting +50: cmpb $0, _ASM_RIP(kvm_rebooting) jne 6b ud2 -70: cmpb $0, kvm_rebooting +70: cmpb $0, _ASM_RIP(kvm_rebooting) jne 8b ud2 @@ -381,7 +381,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) RESTORE_GUEST_SPEC_CTRL_BODY RESTORE_HOST_SPEC_CTRL_BODY -3: cmpb $0, kvm_rebooting +3: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b ud2 diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 313b8bb5b8a7..fab6a1ad98dc 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -13,419 +13,6 @@ #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_SUPPORTED_PINCTRL \ - (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - PIN_BASED_EXT_INTR_MASK | \ - PIN_BASED_NMI_EXITING | \ - PIN_BASED_VIRTUAL_NMIS) - -#define EVMCS1_SUPPORTED_EXEC_CTRL \ - (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - CPU_BASED_HLT_EXITING | \ - CPU_BASED_CR3_LOAD_EXITING | \ - CPU_BASED_CR3_STORE_EXITING | \ - CPU_BASED_UNCOND_IO_EXITING | \ - CPU_BASED_MOV_DR_EXITING | \ - CPU_BASED_USE_TSC_OFFSETTING | \ - CPU_BASED_MWAIT_EXITING | \ - CPU_BASED_MONITOR_EXITING | \ - CPU_BASED_INVLPG_EXITING | \ - CPU_BASED_RDPMC_EXITING | \ - CPU_BASED_INTR_WINDOW_EXITING | \ - CPU_BASED_CR8_LOAD_EXITING | \ - CPU_BASED_CR8_STORE_EXITING | \ - CPU_BASED_RDTSC_EXITING | \ - CPU_BASED_TPR_SHADOW | \ - CPU_BASED_USE_IO_BITMAPS | \ - CPU_BASED_MONITOR_TRAP_FLAG | \ - CPU_BASED_USE_MSR_BITMAPS | \ - CPU_BASED_NMI_WINDOW_EXITING | \ - CPU_BASED_PAUSE_EXITING | \ - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) - -#define EVMCS1_SUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ - SECONDARY_EXEC_WBINVD_EXITING | \ - SECONDARY_EXEC_ENABLE_VPID | \ - SECONDARY_EXEC_ENABLE_EPT | \ - SECONDARY_EXEC_UNRESTRICTED_GUEST | \ - SECONDARY_EXEC_DESC | \ - SECONDARY_EXEC_ENABLE_RDTSCP | \ - SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_ENABLE_XSAVES | \ - SECONDARY_EXEC_RDSEED_EXITING | \ - SECONDARY_EXEC_RDRAND_EXITING | \ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ - SECONDARY_EXEC_PT_USE_GPA | \ - SECONDARY_EXEC_PT_CONCEAL_VMX | \ - SECONDARY_EXEC_BUS_LOCK_DETECTION | \ - SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) - -#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) - -#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_EXIT_SAVE_DEBUG_CONTROLS | \ - VM_EXIT_ACK_INTR_ON_EXIT | \ - VM_EXIT_HOST_ADDR_SPACE_SIZE | \ - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_EXIT_SAVE_IA32_PAT | \ - VM_EXIT_LOAD_IA32_PAT | \ - VM_EXIT_SAVE_IA32_EFER | \ - VM_EXIT_LOAD_IA32_EFER | \ - VM_EXIT_CLEAR_BNDCFGS | \ - VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_IA32E_MODE | \ - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_ENTRY_LOAD_IA32_PAT | \ - VM_ENTRY_LOAD_IA32_EFER | \ - VM_ENTRY_LOAD_BNDCFGS | \ - VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMFUNC (0) - -#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) -#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ - {EVMCS1_OFFSET(name), clean_field} - -const struct evmcs_field vmcs_field_to_evmcs_1[] = { - /* 64 bit rw */ - EVMCS1_FIELD(GUEST_RIP, guest_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_RSP, guest_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR0, host_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR3, host_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR4, host_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_RIP, host_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), - EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(TSC_OFFSET, tsc_offset, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR0, guest_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR3, guest_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR4, guest_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_DR7, guest_dr7, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_RSP, host_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(EPT_POINTER, ept_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), - EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - /* - * Not used by KVM: - * - * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x0000682A, guest_ssp, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - * EVMCS1_FIELD(0x00006C1A, host_ssp, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - */ - - /* 64 bit read only */ - EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - /* - * Not defined in KVM: - * - * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - */ - EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* - * No mask defined in the spec as Hyper-V doesn't currently support - * these. Future proof by resetting the whole clean field mask on - * access. - */ - EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 32 bit rw */ - EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), - EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), - EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), - EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, - vm_entry_exception_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - - /* 32 bit read only */ - EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* No mask defined in the spec (not used) */ - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 16 bit rw */ - EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), -}; -const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); - u64 nested_get_evmptr(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); @@ -608,40 +195,6 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) return 0; } -#if IS_ENABLED(CONFIG_HYPERV) -DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); - -/* - * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption - * is: in case a feature has corresponding fields in eVMCS described and it was - * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a - * feature which has no corresponding eVMCS field, this likely means that KVM - * needs to be updated. - */ -#define evmcs_check_vmcs_conf(field, ctrl) \ - do { \ - typeof(vmcs_conf->field) unsupported; \ - \ - unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \ - if (unsupported) { \ - pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\ - (u64)unsupported); \ - vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \ - } \ - } \ - while (0) - -void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) -{ - evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); - evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); - evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC); - evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC); - evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL); - evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL); -} -#endif - int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 9623fe1651c4..a87407412615 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -2,199 +2,89 @@ #ifndef __KVM_X86_VMX_HYPERV_H #define __KVM_X86_VMX_HYPERV_H -#include <linux/jump_label.h> - -#include <asm/hyperv-tlfs.h> -#include <asm/mshyperv.h> -#include <asm/vmx.h> - -#include "../hyperv.h" - -#include "capabilities.h" -#include "vmcs.h" +#include <linux/kvm_host.h> #include "vmcs12.h" +#include "vmx.h" -struct vmcs_config; - -#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) - -#define KVM_EVMCS_VERSION 1 +#define EVMPTR_INVALID (-1ULL) +#define EVMPTR_MAP_PENDING (-2ULL) -struct evmcs_field { - u16 offset; - u16 clean_field; +enum nested_evmptrld_status { + EVMPTRLD_DISABLED, + EVMPTRLD_SUCCEEDED, + EVMPTRLD_VMFAIL, + EVMPTRLD_ERROR, }; -extern const struct evmcs_field vmcs_field_to_evmcs_1[]; -extern const unsigned int nr_evmcs_1_fields; - -static __always_inline int evmcs_field_offset(unsigned long field, - u16 *clean_field) -{ - unsigned int index = ROL16(field, 6); - const struct evmcs_field *evmcs_field; - - if (unlikely(index >= nr_evmcs_1_fields)) - return -ENOENT; - - evmcs_field = &vmcs_field_to_evmcs_1[index]; - - /* - * Use offset=0 to detect holes in eVMCS. This offset belongs to - * 'revision_id' but this field has no encoding and is supposed to - * be accessed directly. - */ - if (unlikely(!evmcs_field->offset)) - return -ENOENT; - - if (clean_field) - *clean_field = evmcs_field->clean_field; - - return evmcs_field->offset; -} - -static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, - unsigned long field, u16 offset) +#ifdef CONFIG_KVM_HYPERV +static inline bool evmptr_is_valid(u64 evmptr) { - /* - * vmcs12_read_any() doesn't care whether the supplied structure - * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes - * the exact offset of the required field, use it for convenience - * here. - */ - return vmcs12_read_any((void *)evmcs, field, offset); + return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; } -#if IS_ENABLED(CONFIG_HYPERV) - -DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); - -static __always_inline bool kvm_is_using_evmcs(void) +static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx) { - return static_branch_unlikely(&__kvm_is_using_evmcs); + return evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); } -static __always_inline int get_evmcs_offset(unsigned long field, - u16 *clean_field) +static inline bool evmptr_is_set(u64 evmptr) { - int offset = evmcs_field_offset(field, clean_field); - - WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field); - return offset; + return evmptr != EVMPTR_INVALID; } -static __always_inline void evmcs_write64(unsigned long field, u64 value) +static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx) { - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u64 *)((char *)current_evmcs + offset) = value; - - current_evmcs->hv_clean_fields &= ~clean_field; + return evmptr_is_set(vmx->nested.hv_evmcs_vmptr); } -static __always_inline void evmcs_write32(unsigned long field, u32 value) +static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) { - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u32 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; + return vmx->nested.hv_evmcs; } -static __always_inline void evmcs_write16(unsigned long field, u16 value) +static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) { - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u16 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; + /* + * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and + * eVMCS has been explicitly enabled by userspace. + */ + return vcpu->arch.hyperv_enabled && + to_vmx(vcpu)->nested.enlightened_vmcs_enabled; } -static __always_inline u64 evmcs_read64(unsigned long field) +u64 nested_get_evmptr(struct kvm_vcpu *vcpu); +uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); +int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); +void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); +int nested_evmcs_check_controls(struct vmcs12 *vmcs12); +bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu); +void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); +#else +static inline bool evmptr_is_valid(u64 evmptr) { - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u64 *)((char *)current_evmcs + offset); + return false; } -static __always_inline u32 evmcs_read32(unsigned long field) +static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx) { - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u32 *)((char *)current_evmcs + offset); + return false; } -static __always_inline u16 evmcs_read16(unsigned long field) +static inline bool evmptr_is_set(u64 evmptr) { - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u16 *)((char *)current_evmcs + offset); + return false; } -static inline void evmcs_load(u64 phys_addr) +static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx) { - struct hv_vp_assist_page *vp_ap = - hv_get_vp_assist_page(smp_processor_id()); - - if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) - vp_ap->nested_control.features.directhypercall = 1; - vp_ap->current_nested_vmcs = phys_addr; - vp_ap->enlighten_vmentry = 1; + return false; } -void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); -#else /* !IS_ENABLED(CONFIG_HYPERV) */ -static __always_inline bool kvm_is_using_evmcs(void) { return false; } -static __always_inline void evmcs_write64(unsigned long field, u64 value) {} -static __always_inline void evmcs_write32(unsigned long field, u32 value) {} -static __always_inline void evmcs_write16(unsigned long field, u16 value) {} -static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } -static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } -static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } -static inline void evmcs_load(u64 phys_addr) {} -#endif /* IS_ENABLED(CONFIG_HYPERV) */ - -#define EVMPTR_INVALID (-1ULL) -#define EVMPTR_MAP_PENDING (-2ULL) - -static inline bool evmptr_is_valid(u64 evmptr) +static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) { - return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; + return NULL; } - -enum nested_evmptrld_status { - EVMPTRLD_DISABLED, - EVMPTRLD_SUCCEEDED, - EVMPTRLD_VMFAIL, - EVMPTRLD_ERROR, -}; - -u64 nested_get_evmptr(struct kvm_vcpu *vcpu); -uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); -int nested_enable_evmcs(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version); -void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); -int nested_evmcs_check_controls(struct vmcs12 *vmcs12); -bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu); -void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); +#endif #endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.c b/arch/x86/kvm/vmx/hyperv_evmcs.c new file mode 100644 index 000000000000..904bfcd1519b --- /dev/null +++ b/arch/x86/kvm/vmx/hyperv_evmcs.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains common code for working with Enlightened VMCS which is + * used both by Hyper-V on KVM and KVM on Hyper-V. + */ + +#include "hyperv_evmcs.h" + +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) +#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ + {EVMCS1_OFFSET(name), clean_field} + +const struct evmcs_field vmcs_field_to_evmcs_1[] = { + /* 64 bit rw */ + EVMCS1_FIELD(GUEST_RIP, guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), + EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR4, guest_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), + EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + /* + * Not used by KVM: + * + * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x0000682A, guest_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1A, host_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + */ + + /* 64 bit read only */ + EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + /* + * Not defined in KVM: + * + * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + */ + EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* + * No mask defined in the spec as Hyper-V doesn't currently support + * these. Future proof by resetting the whole clean field mask on + * access. + */ + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 32 bit rw */ + EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), + EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), + EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), + EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + + /* 32 bit read only */ + EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* No mask defined in the spec (not used) */ + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 16 bit rw */ + EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), +}; +const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h new file mode 100644 index 000000000000..a543fccfc574 --- /dev/null +++ b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file contains common definitions for working with Enlightened VMCS which + * are used both by Hyper-V on KVM and KVM on Hyper-V. + */ +#ifndef __KVM_X86_VMX_HYPERV_EVMCS_H +#define __KVM_X86_VMX_HYPERV_EVMCS_H + +#include <asm/hyperv-tlfs.h> + +#include "capabilities.h" +#include "vmcs12.h" + +#define KVM_EVMCS_VERSION 1 + +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) + +struct evmcs_field { + u16 offset; + u16 clean_field; +}; + +extern const struct evmcs_field vmcs_field_to_evmcs_1[]; +extern const unsigned int nr_evmcs_1_fields; + +static __always_inline int evmcs_field_offset(unsigned long field, + u16 *clean_field) +{ + const struct evmcs_field *evmcs_field; + unsigned int index = ROL16(field, 6); + + if (unlikely(index >= nr_evmcs_1_fields)) + return -ENOENT; + + evmcs_field = &vmcs_field_to_evmcs_1[index]; + + /* + * Use offset=0 to detect holes in eVMCS. This offset belongs to + * 'revision_id' but this field has no encoding and is supposed to + * be accessed directly. + */ + if (unlikely(!evmcs_field->offset)) + return -ENOENT; + + if (clean_field) + *clean_field = evmcs_field->clean_field; + + return evmcs_field->offset; +} + +static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, + unsigned long field, u16 offset) +{ + /* + * vmcs12_read_any() doesn't care whether the supplied structure + * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes + * the exact offset of the required field, use it for convenience + * here. + */ + return vmcs12_read_any((void *)evmcs, field, offset); +} + +#endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c5ec0ef51ff7..db0ad1e6ec4b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -179,7 +179,7 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu, * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all * fields and thus must be synced. */ - if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; return kvm_skip_emulated_instruction(vcpu); @@ -194,7 +194,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * can't be done if there isn't a current VMCS. */ if (vmx->nested.current_vmptr == INVALID_GPA && - !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + !nested_vmx_is_evmptr12_valid(vmx)) return nested_vmx_failInvalid(vcpu); return nested_vmx_failValid(vcpu, vm_instruction_error); @@ -226,10 +226,11 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { +#ifdef CONFIG_KVM_HYPERV struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (nested_vmx_is_evmptr12_valid(vmx)) { kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); vmx->nested.hv_evmcs = NULL; } @@ -241,6 +242,34 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) hv_vcpu->nested.vm_id = 0; hv_vcpu->nested.vp_id = 0; } +#endif +} + +static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) +{ +#ifdef CONFIG_KVM_HYPERV + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * When Enlightened VMEntry is enabled on the calling CPU we treat + * memory area pointer by vmptr as Enlightened VMCS (as there's no good + * way to distinguish it from VMCS12) and we must not corrupt it by + * writing to the non-existent 'launch_state' field. The area doesn't + * have to be the currently active EVMCS on the calling CPU and there's + * nothing KVM has to do to transition it from 'active' to 'non-active' + * state. It is possible that the area will stay mapped as + * vmx->nested.hv_evmcs but this shouldn't be a problem. + */ + if (!guest_cpuid_has_evmcs(vcpu) || + !evmptr_is_valid(nested_get_evmptr(vcpu))) + return false; + + if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) + nested_release_evmcs(vcpu); + + return true; +#else + return false; +#endif } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, @@ -572,7 +601,6 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ @@ -588,10 +616,13 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature * and tells KVM (L0) there were no changes in MSR bitmap for L2. */ - if (!vmx->nested.force_msr_bitmap_recalc && evmcs && - evmcs->hv_enlightenments_control.msr_bitmap && - evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) - return true; + if (!vmx->nested.force_msr_bitmap_recalc) { + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); + + if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && + evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) + return true; + } if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) return false; @@ -1085,7 +1116,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, bool reload_pdptrs, enum vm_entry_failure_code *entry_failure_code) { - if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { + if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -1139,14 +1170,8 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or - * L2's VP_ID upon request from the guest. Make sure we check for - * pending entries in the right FIFO upon L1/L2 transition as these - * requests are put by other vCPUs asynchronously. - */ - if (to_hv_vcpu(vcpu) && enable_ept) - kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + /* Handle pending Hyper-V TLB flush requests */ + kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); /* * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings @@ -1578,8 +1603,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) { +#ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ @@ -1818,12 +1844,16 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields */ return; +#else /* CONFIG_KVM_HYPERV */ + KVM_BUG_ON(1, vmx->vcpu.kvm); +#endif /* CONFIG_KVM_HYPERV */ } static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) { +#ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); /* * Should not be changed by KVM: @@ -1992,6 +2022,9 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; return; +#else /* CONFIG_KVM_HYPERV */ + KVM_BUG_ON(1, vmx->vcpu.kvm); +#endif /* CONFIG_KVM_HYPERV */ } /* @@ -2001,6 +2034,7 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( struct kvm_vcpu *vcpu, bool from_launch) { +#ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); bool evmcs_gpa_changed = false; u64 evmcs_gpa; @@ -2082,13 +2116,16 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( } return EVMPTRLD_SUCCEEDED; +#else + return EVMPTRLD_DISABLED; +#endif } void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) copy_vmcs12_to_enlightened(vmx); else copy_vmcs12_to_shadow(vmx); @@ -2242,7 +2279,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) prepare_vmcs02_early_rare(vmx, vmcs12); /* @@ -2403,7 +2440,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { @@ -2535,15 +2572,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); bool load_guest_pdptrs_vmcs12 = false; - if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; - load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || - !(vmx->nested.hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || + !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } if (vmx->nested.nested_run_pending && @@ -2664,9 +2701,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * bits when it changes a field in eVMCS. Mark all fields as clean * here. */ - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) - vmx->nested.hv_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + if (nested_vmx_is_evmptr12_valid(vmx)) + evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; return 0; } @@ -2717,7 +2753,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) } /* Reserved bits should not be set */ - if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) + if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ @@ -2888,8 +2924,10 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, nested_check_vm_entry_controls(vcpu, vmcs12)) return -EINVAL; +#ifdef CONFIG_KVM_HYPERV if (guest_cpuid_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); +#endif return 0; } @@ -2912,7 +2950,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || - CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) + CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || @@ -3161,6 +3199,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } +#ifdef CONFIG_KVM_HYPERV static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3188,6 +3227,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) return true; } +#endif static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { @@ -3279,6 +3319,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) { +#ifdef CONFIG_KVM_HYPERV /* * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory @@ -3295,6 +3336,7 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) return false; } +#endif if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) return false; @@ -3538,7 +3580,7 @@ vmentry_fail_vmexit: load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason.full; - if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; } @@ -3569,7 +3611,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); - if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && + if (CC(!nested_vmx_is_evmptr12_valid(vmx) && vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); @@ -3584,8 +3626,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { - copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); + if (nested_vmx_is_evmptr12_valid(vmx)) { + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); + + copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); /* Enlightened VMCS doesn't have launch state */ vmcs12->launch_state = !launch; } else if (enable_shadow_vmcs) { @@ -4329,11 +4373,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = - !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); + !nested_vmx_is_evmptr12_valid(vmx); vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -4732,6 +4776,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); +#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { /* * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map @@ -4741,6 +4786,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, */ (void)nested_get_evmcs_page(vcpu); } +#endif /* Service pending TLB flush requests for L2 before switching to L1. */ kvm_service_local_tlb_flush_requests(vcpu); @@ -4854,7 +4900,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } if ((vm_exit_reason != -1) && - (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) + (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ @@ -4980,6 +5026,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, else *ret = off; + *ret = vmx_get_untagged_addr(vcpu, *ret, 0); /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode! @@ -5292,18 +5339,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - /* - * When Enlightened VMEntry is enabled on the calling CPU we treat - * memory area pointer by vmptr as Enlightened VMCS (as there's no good - * way to distinguish it from VMCS12) and we must not corrupt it by - * writing to the non-existent 'launch_state' field. The area doesn't - * have to be the currently active EVMCS on the calling CPU and there's - * nothing KVM has to do to transition it from 'active' to 'non-active' - * state. It is possible that the area will stay mapped as - * vmx->nested.hv_evmcs but this shouldn't be a problem. - */ - if (likely(!guest_cpuid_has_evmcs(vcpu) || - !evmptr_is_valid(nested_get_evmptr(vcpu)))) { + if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); @@ -5320,8 +5356,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); - } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { - nested_release_evmcs(vcpu); } return nested_vmx_succeed(vcpu); @@ -5360,7 +5394,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (!nested_vmx_is_evmptr12_valid(vmx)) { /* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. @@ -5398,7 +5432,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* Read the field, zero-extended to a u64 value */ - value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); + value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); } /* @@ -5586,7 +5620,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) return 1; if (vmx->nested.current_vmptr != vmptr) { @@ -5649,7 +5683,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) + if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) return 1; if (get_vmx_mem_address(vcpu, exit_qual, instr_info, @@ -5797,6 +5831,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + /* + * LAM doesn't apply to addresses that are inputs to TLB + * invalidation. + */ if (!operand.vpid || is_noncanonical_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu, @@ -6208,11 +6246,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, * Handle L2's bus locks in L0 directly. */ return true; +#ifdef CONFIG_KVM_HYPERV case EXIT_REASON_VMCALL: /* Hyper-V L2 TLB flush hypercall is handled by L0 */ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && nested_evmcs_l2_tlb_flush_enabled(vcpu) && kvm_hv_is_tlb_flush_hcall(vcpu); +#endif default: break; } @@ -6435,7 +6475,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ - if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + if (nested_vmx_is_evmptr12_set(vmx)) kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && @@ -6491,7 +6531,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, } else { copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) /* * L1 hypervisor is not obliged to keep eVMCS * clean fields data always up-to-date while @@ -6632,6 +6672,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); +#ifdef CONFIG_KVM_HYPERV } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* * nested_vmx_handle_enlightened_vmptrld() cannot be called @@ -6641,6 +6682,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, */ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); +#endif } else { return -EINVAL; } @@ -7096,7 +7138,9 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .set_state = vmx_set_nested_state, .get_nested_state_pages = vmx_get_nested_state_pages, .write_log_dirty = nested_vmx_write_pml_buffer, +#ifdef CONFIG_KVM_HYPERV .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, +#endif }; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index b4b9d51438c6..cce4e2aa30fb 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -3,6 +3,7 @@ #define __KVM_X86_VMX_NESTED_H #include "kvm_cache_regs.h" +#include "hyperv.h" #include "vmcs12.h" #include "vmx.h" @@ -57,7 +58,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ return vmx->nested.current_vmptr != -1ull || - vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID; + nested_vmx_is_evmptr12_set(vmx); } static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 820d3e1f6b4f..a6216c874729 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -437,11 +437,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc_write_counter(pmc, data); - pmc_update_sample_period(pmc); break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc_write_counter(pmc, data); - pmc_update_sample_period(pmc); break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { reserved_bits = pmu->reserved_bits; @@ -632,26 +630,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) static void intel_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = NULL; - int i; - - for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { - pmc = &pmu->gp_counters[i]; - - pmc_stop_counter(pmc); - pmc->counter = pmc->prev_counter = pmc->eventsel = 0; - } - - for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { - pmc = &pmu->fixed_counters[i]; - - pmc_stop_counter(pmc); - pmc->counter = pmc->prev_counter = 0; - } - - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; - intel_pmu_release_guest_lbr_event(vcpu); } diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 3e822e582497..6fef01e0536e 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -37,6 +37,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, if (!IS_ALIGNED(*gva, alignment)) { fault = true; } else if (likely(is_64_bit_mode(vcpu))) { + *gva = vmx_get_untagged_addr(vcpu, *gva, 0); fault = is_noncanonical_address(*gva, vcpu); } else { *gva &= 0xffffffff; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index be275a0410a8..906ecd001511 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -289,7 +289,7 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) RET .Lfixup: - cmpb $0, kvm_rebooting + cmpb $0, _ASM_RIP(kvm_rebooting) jne .Lvmfail ud2 .Lvmfail: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 40e3780d73ae..d21f55f323ea 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -66,6 +66,7 @@ #include "vmx.h" #include "x86.h" #include "smm.h" +#include "vmx_onhyperv.h" MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -523,22 +524,14 @@ module_param(enlightened_vmcs, bool, 0444); static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; - struct hv_partition_assist_pg **p_hv_pa_pg = - &to_kvm_hv(vcpu->kvm)->hv_pa_pg; - /* - * Synthetic VM-Exit is not enabled in current code and so All - * evmcs in singe VM shares same assist page. - */ - if (!*p_hv_pa_pg) - *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); + hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); - if (!*p_hv_pa_pg) + if (partition_assist_page == INVALID_PAGE) return -ENOMEM; evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; - evmcs->partition_assist_page = - __pa(*p_hv_pa_pg); + evmcs->partition_assist_page = partition_assist_page; evmcs->hv_vm_id = (unsigned long)vcpu->kvm; evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; @@ -2055,6 +2048,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) return 1; +#ifdef CONFIG_KVM_HYPERV /* * Enlightened VMCS v1 doesn't have certain VMCS fields but * instead of just ignoring the features, different Hyper-V @@ -2065,6 +2059,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) nested_evmcs_filter_control_msr(vcpu, msr_info->index, &msr_info->data); +#endif break; case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest()) @@ -3400,7 +3395,8 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { - guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); + guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | + kvm_get_active_cr3_lam_bits(vcpu); } if (update_guest_cr3) @@ -4833,7 +4829,10 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->nested.posted_intr_nv = -1; vmx->nested.vmxon_ptr = INVALID_GPA; vmx->nested.current_vmptr = INVALID_GPA; + +#ifdef CONFIG_KVM_HYPERV vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; +#endif vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; @@ -5782,7 +5781,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. */ - if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) + if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); @@ -7673,6 +7672,9 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); + cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); + #undef cr4_fixed1_update } @@ -7759,6 +7761,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM); vmx_setup_uret_msrs(vmx); @@ -8205,6 +8208,50 @@ static void vmx_vm_destroy(struct kvm *kvm) free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); } +/* + * Note, the SDM states that the linear address is masked *after* the modified + * canonicality check, whereas KVM masks (untags) the address and then performs + * a "normal" canonicality check. Functionally, the two methods are identical, + * and when the masking occurs relative to the canonicality check isn't visible + * to software, i.e. KVM's behavior doesn't violate the SDM. + */ +gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) +{ + int lam_bit; + unsigned long cr3_bits; + + if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) + return gva; + + if (!is_64_bit_mode(vcpu)) + return gva; + + /* + * Bit 63 determines if the address should be treated as user address + * or a supervisor address. + */ + if (!(gva & BIT_ULL(63))) { + cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); + if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) + return gva; + + /* LAM_U48 is ignored if LAM_U57 is set. */ + lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; + } else { + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) + return gva; + + lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; + } + + /* + * Untag the address by sign-extending the lam_bit, but NOT to bit 63. + * Bit 63 is retained from the raw virtual address so that untagging + * doesn't change a user access to a supervisor access, and vice versa. + */ + return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = KBUILD_MODNAME, @@ -8345,6 +8392,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .complete_emulated_msr = kvm_complete_insn_gp, .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, + + .get_untagged_addr = vmx_get_untagged_addr, }; static unsigned int vmx_handle_intel_pt_intr(void) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c2130d2c8e24..e3b0985bb74a 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -241,9 +241,11 @@ struct nested_vmx { bool guest_mode; } smm; +#ifdef CONFIG_KVM_HYPERV gpa_t hv_evmcs_vmptr; struct kvm_host_map hv_evmcs_map; struct hv_enlightened_vmcs *hv_evmcs; +#endif }; struct vcpu_vmx { @@ -420,6 +422,8 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); +gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); + static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool value) { @@ -745,14 +749,4 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && enable_ipiv; } -static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) -{ - /* - * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and - * eVMCS has been explicitly enabled by userspace. - */ - return vcpu->arch.hyperv_enabled && - to_vmx(vcpu)->nested.enlightened_vmcs_enabled; -} - #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.c b/arch/x86/kvm/vmx/vmx_onhyperv.c new file mode 100644 index 000000000000..b9a8b91166d0 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx_onhyperv.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "capabilities.h" +#include "vmx_onhyperv.h" + +DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); + +/* + * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption + * is: in case a feature has corresponding fields in eVMCS described and it was + * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a + * feature which has no corresponding eVMCS field, this likely means that KVM + * needs to be updated. + */ +#define evmcs_check_vmcs_conf(field, ctrl) \ + do { \ + typeof(vmcs_conf->field) unsupported; \ + \ + unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \ + if (unsupported) { \ + pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\ + (u64)unsupported); \ + vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \ + } \ + } \ + while (0) + +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); + evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); + evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC); + evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC); + evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL); + evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL); +} diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h new file mode 100644 index 000000000000..eb48153bfd73 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx_onhyperv.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__ +#define __ARCH_X86_KVM_VMX_ONHYPERV_H__ + +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> + +#include <linux/jump_label.h> + +#include "capabilities.h" +#include "hyperv_evmcs.h" +#include "vmcs12.h" + +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) + +#if IS_ENABLED(CONFIG_HYPERV) + +DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); + +static __always_inline bool kvm_is_using_evmcs(void) +{ + return static_branch_unlikely(&__kvm_is_using_evmcs); +} + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + int offset = evmcs_field_offset(field, clean_field); + + WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field); + return offset; +} + +static __always_inline void evmcs_write64(unsigned long field, u64 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u64 *)((char *)current_evmcs + offset) = value; + + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static __always_inline void evmcs_write32(unsigned long field, u32 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u32 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static __always_inline void evmcs_write16(unsigned long field, u16 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u16 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static __always_inline u64 evmcs_read64(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u64 *)((char *)current_evmcs + offset); +} + +static __always_inline u32 evmcs_read32(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u32 *)((char *)current_evmcs + offset); +} + +static __always_inline u16 evmcs_read16(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u16 *)((char *)current_evmcs + offset); +} + +static inline void evmcs_load(u64 phys_addr) +{ + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(smp_processor_id()); + + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) + vp_ap->nested_control.features.directhypercall = 1; + vp_ap->current_nested_vmcs = phys_addr; + vp_ap->enlighten_vmentry = 1; +} + +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); +#else /* !IS_ENABLED(CONFIG_HYPERV) */ +static __always_inline bool kvm_is_using_evmcs(void) { return false; } +static __always_inline void evmcs_write64(unsigned long field, u64 value) {} +static __always_inline void evmcs_write32(unsigned long field, u32 value) {} +static __always_inline void evmcs_write16(unsigned long field, u16 value) {} +static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } +static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } +static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } +static inline void evmcs_load(u64 phys_addr) {} +#endif /* IS_ENABLED(CONFIG_HYPERV) */ + +#endif /* __ARCH_X86_KVM_VMX_ONHYPERV_H__ */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 33af7b4c6eb4..f41ce3c24123 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -6,7 +6,7 @@ #include <asm/vmx.h> -#include "hyperv.h" +#include "vmx_onhyperv.h" #include "vmcs.h" #include "../x86.h" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d0772b47041..0366eca119d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1284,7 +1284,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * stuff CR3, e.g. for RSM emulation, and there is no guarantee that * the current vCPU mode is accurate. */ - if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) + if (!kvm_vcpu_is_legal_cr3(vcpu, cr3)) return 1; if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) @@ -1504,6 +1504,8 @@ static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + +#ifdef CONFIG_KVM_HYPERV HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, @@ -1521,6 +1523,7 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, HV_X64_MSR_SYNDBG_PENDING_BUFFER, +#endif MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, @@ -2510,26 +2513,29 @@ static inline int gtod_is_based_on_tsc(int mode) } #endif -static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation) { #ifdef CONFIG_X86_64 - bool vcpus_matched; struct kvm_arch *ka = &vcpu->kvm->arch; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == - atomic_read(&vcpu->kvm->online_vcpus)); + /* + * To use the masterclock, the host clocksource must be based on TSC + * and all vCPUs must have matching TSCs. Note, the count for matching + * vCPUs doesn't include the reference vCPU, hence "+1". + */ + bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&vcpu->kvm->online_vcpus)) && + gtod_is_based_on_tsc(gtod->clock.vclock_mode); /* - * Once the masterclock is enabled, always perform request in - * order to update it. - * - * In order to enable masterclock, the host clocksource must be TSC - * and the vcpus need to have matched TSCs. When that happens, - * perform request to enable masterclock. + * Request a masterclock update if the masterclock needs to be toggled + * on/off, or when starting a new generation and the masterclock is + * enabled (compute_guest_tsc() requires the masterclock snapshot to be + * taken _after_ the new generation is created). */ - if (ka->use_master_clock || - (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) + if ((ka->use_master_clock && new_generation) || + (ka->use_master_clock != use_master_clock)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, @@ -2706,7 +2712,7 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - kvm_track_tsc_matching(vcpu); + kvm_track_tsc_matching(vcpu, !matched); } static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) @@ -4020,6 +4026,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * the need to ignore the workaround. */ break; +#ifdef CONFIG_KVM_HYPERV case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -4032,6 +4039,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); +#endif case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. @@ -4377,6 +4385,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ msr_info->data = 0x20000000; break; +#ifdef CONFIG_KVM_HYPERV case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -4390,6 +4399,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); +#endif case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -4527,6 +4537,7 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +#ifdef CONFIG_KVM_HYPERV static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 __user *cpuid_arg) { @@ -4547,6 +4558,7 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, return 0; } +#endif static bool kvm_is_vm_type_supported(unsigned long type) { @@ -4580,9 +4592,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_VCPU_EVENTS: +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_HYPERV_TIME: case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_HYPERV_SYNIC2: case KVM_CAP_HYPERV_VP_INDEX: @@ -4592,6 +4606,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_CPUID: case KVM_CAP_HYPERV_ENFORCE_CPUID: case KVM_CAP_SYS_HYPERV_CPUID: +#endif case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -4601,7 +4616,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: - case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_DISABLE_QUIRKS: @@ -4712,12 +4726,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_x86_ops.nested_ops->get_state ? kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: r = kvm_x86_ops.enable_l2_tlb_flush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; +#endif case KVM_CAP_SMALLER_MAXPHYADDR: r = (int) allow_smaller_maxphyaddr; break; @@ -4884,9 +4900,11 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; +#ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); break; +#endif case KVM_GET_DEVICE_ATTR: { struct kvm_device_attr attr; r = -EFAULT; @@ -5531,8 +5549,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, - sizeof(guest_xsave->region)); + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, @@ -5712,14 +5730,11 @@ static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { - int r; - uint16_t vmcs_version; - void __user *user_ptr; - if (cap->flags) return -EINVAL; switch (cap->cap) { +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV_SYNIC2: if (cap->args[0]) return -EINVAL; @@ -5731,16 +5746,22 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops.nested_ops->enable_evmcs) - return -ENOTTY; - r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); - if (!r) { - user_ptr = (void __user *)(uintptr_t)cap->args[0]; - if (copy_to_user(user_ptr, &vmcs_version, - sizeof(vmcs_version))) - r = -EFAULT; + { + int r; + uint16_t vmcs_version; + void __user *user_ptr; + + if (!kvm_x86_ops.nested_ops->enable_evmcs) + return -ENOTTY; + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); + if (!r) { + user_ptr = (void __user *)(uintptr_t)cap->args[0]; + if (copy_to_user(user_ptr, &vmcs_version, + sizeof(vmcs_version))) + r = -EFAULT; + } + return r; } - return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: if (!kvm_x86_ops.enable_l2_tlb_flush) return -ENOTTY; @@ -5749,6 +5770,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_HYPERV_ENFORCE_CPUID: return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); +#endif case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; @@ -6141,9 +6163,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } +#ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); break; +#endif #ifdef CONFIG_KVM_XEN case KVM_XEN_VCPU_GET_ATTR: { struct kvm_xen_vcpu_attr xva; @@ -7201,6 +7225,7 @@ set_pit2_out: r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); break; } +#ifdef CONFIG_KVM_HYPERV case KVM_HYPERV_EVENTFD: { struct kvm_hyperv_eventfd hvevfd; @@ -7210,6 +7235,7 @@ set_pit2_out: r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } +#endif case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; @@ -8445,6 +8471,15 @@ static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) kvm_vm_bugged(kvm); } +static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt, + gva_t addr, unsigned int flags) +{ + if (!kvm_x86_ops.get_untagged_addr) + return addr; + + return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags); +} + static const struct x86_emulate_ops emulate_ops = { .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, @@ -8489,6 +8524,7 @@ static const struct x86_emulate_ops emulate_ops = { .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, + .get_untagged_addr = emulator_get_untagged_addr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -10588,19 +10624,20 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; +#ifdef CONFIG_KVM_HYPERV if (to_hv_vcpu(vcpu)) { + u64 eoi_exit_bitmap[4]; + bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); return; } - +#endif static_call_cond(kvm_x86_load_eoi_exitmap)( vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } @@ -10691,9 +10728,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * the flushes are considered "remote" and not "local" because * the requests can be initiated from other vCPUs. */ +#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) && kvm_hv_vcpu_flush_tlb(vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); +#endif if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -10746,6 +10785,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu_load_eoi_exitmap(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); +#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; @@ -10776,6 +10816,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) kvm_hv_process_stimers(vcpu); +#endif if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) @@ -11612,7 +11653,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) */ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) return false; - if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) + if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3)) return false; } else { /* @@ -12221,7 +12262,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) } if (!init_event) { - kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; vcpu->arch.msr_misc_features_enables = 0; @@ -12438,7 +12478,9 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_free_vm(struct kvm *kvm) { - kfree(to_kvm_hv(kvm)->hv_pa_pg); +#if IS_ENABLED(CONFIG_HYPERV) + kfree(kvm->arch.hv_pa_pg); +#endif __kvm_arch_free_vm(kvm); } @@ -13051,7 +13093,10 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return true; - return vcpu->arch.preempted_in_kernel; + if (vcpu != kvm_get_running_vcpu()) + return vcpu->arch.preempted_in_kernel; + + return static_call(kvm_x86_get_cpl)(vcpu) == 0; } unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) @@ -13553,6 +13598,10 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) switch (type) { case INVPCID_TYPE_INDIV_ADDR: + /* + * LAM doesn't apply to addresses that are inputs to TLB + * invalidation. + */ if ((!pcid_enabled && (operand.pcid != 0)) || is_noncanonical_address(operand.gla, vcpu)) { kvm_inject_gp(vcpu, 0); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5184fde1dc54..2f7e19166658 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -530,6 +530,8 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); __reserved_bits |= X86_CR4_VMXE; \ if (!__cpu_has(__c, X86_FEATURE_PCID)) \ __reserved_bits |= X86_CR4_PCIDE; \ + if (!__cpu_has(__c, X86_FEATURE_LAM)) \ + __reserved_bits |= X86_CR4_LAM_SUP; \ __reserved_bits; \ }) diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index a68f2dda0948..70b91de2e053 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -32,6 +32,7 @@ #include <asm/msr.h> #include <asm/cmdline.h> #include <asm/sev.h> +#include <asm/ia32.h> #include "mm_internal.h" @@ -481,6 +482,16 @@ void __init sme_early_init(void) */ if (sev_status & MSR_AMD64_SEV_ES_ENABLED) x86_cpuinit.parallel_bringup = false; + + /* + * The VMM is capable of injecting interrupt 0x80 and triggering the + * compatibility syscall path. + * + * By default, the 32-bit emulation is disabled in order to ensure + * the safety of the VM. + */ + if (sev_status & MSR_AMD64_SEV_ENABLED) + ia32_disable(); } void __init mem_encrypt_free_decrypted_mem(void) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c10d9abc239..e89e415aa743 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3025,3 +3025,49 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp #endif WARN(1, "verification of programs using bpf_throw should have failed\n"); } + +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + u8 *old_addr, *new_addr, *old_bypass_addr; + int ret; + + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + /* + * On program loading or teardown, the program's kallsym entry + * might not be in place, so we use __bpf_arch_text_poke to skip + * the kallsyms check. + */ + if (new) { + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0); + if (!old) { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0); + } + } else { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0); + } +} diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 9b1ec5d8c99c..a65fc2ae15b4 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,6 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0337392a3121..3c61bb98c10e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -33,9 +33,12 @@ EXPORT_SYMBOL_GPL(hypercall_page); * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info * but during boot it is switched to point to xen_vcpu_info. * The pointer is used in xen_evtchn_do_upcall to acknowledge pending events. + * Make sure that xen_vcpu_info doesn't cross a page boundary by making it + * cache-line aligned (the struct is guaranteed to have a size of 64 bytes, + * which matches the cache line size of 64-bit x86 processors). */ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); -DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ DEFINE_PER_CPU(uint32_t, xen_vcpu_id); @@ -160,6 +163,7 @@ void xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; + BUILD_BUG_ON(sizeof(*vcpup) > SMP_CACHE_BYTES); BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); /* diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index bbbfdd495ebd..aeb33e0a3f76 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -704,7 +704,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION - { entry_INT80_compat, xen_entry_INT80_compat, false }, + TRAP_ENTRY(int80_emulation, false ), #endif TRAP_ENTRY(exc_page_fault, false ), TRAP_ENTRY(exc_divide_error, false ), diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 9e5e68008785..1a9cd18dfbd3 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -156,7 +156,7 @@ xen_pv_trap asm_xenpv_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION -xen_pv_trap entry_INT80_compat +xen_pv_trap asm_int80_emulation #endif xen_pv_trap asm_exc_xen_unknown_trap xen_pv_trap asm_exc_xen_hypervisor_callback diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 408a2aa66c69..a87ab36889e7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -21,7 +21,7 @@ extern void *xen_initial_gdt; struct trap_info; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); |