diff options
Diffstat (limited to 'arch/x86')
32 files changed, 412 insertions, 227 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6d1879ef933a..67745ceab0db 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1973,7 +1973,6 @@ config EFI config EFI_STUB bool "EFI stub support" depends on EFI - depends on $(cc-option,-mabi=ms) || X86_32 select RELOCATABLE help This kernel feature allows a bzImage to be loaded directly diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 928dcf7a20d9..b8998cf0508a 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -34,6 +34,8 @@ #define VE_GET_PORT_NUM(e) ((e) >> 16) #define VE_IS_IO_STRING(e) ((e) & BIT(4)) +#define ATTR_SEPT_VE_DISABLE BIT(28) + /* * Wrapper for standard use of __tdx_hypercall with no output aside from * return code. @@ -98,10 +100,11 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); } -static u64 get_cc_mask(void) +static void tdx_parse_tdinfo(u64 *cc_mask) { struct tdx_module_output out; unsigned int gpa_width; + u64 td_attr; /* * TDINFO TDX module call is used to get the TD execution environment @@ -109,19 +112,27 @@ static u64 get_cc_mask(void) * information, etc. More details about the ABI can be found in TDX * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL * [TDG.VP.INFO]. + */ + tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out); + + /* + * The highest bit of a guest physical address is the "sharing" bit. + * Set it for shared pages and clear it for private pages. * * The GPA width that comes out of this call is critical. TDX guests * can not meaningfully run without it. */ - tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out); - gpa_width = out.rcx & GENMASK(5, 0); + *cc_mask = BIT_ULL(gpa_width - 1); /* - * The highest bit of a guest physical address is the "sharing" bit. - * Set it for shared pages and clear it for private pages. + * The kernel can not handle #VE's when accessing normal kernel + * memory. Ensure that no #VE will be delivered for accesses to + * TD-private memory. Only VMM-shared memory (MMIO) will #VE. */ - return BIT_ULL(gpa_width - 1); + td_attr = out.rdx; + if (!(td_attr & ATTR_SEPT_VE_DISABLE)) + panic("TD misconfiguration: SEPT_VE_DISABLE attibute must be set.\n"); } /* @@ -758,7 +769,7 @@ void __init tdx_early_init(void) setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); cc_set_vendor(CC_VENDOR_INTEL); - cc_mask = get_cc_mask(); + tdx_parse_tdinfo(&cc_mask); cc_set_mask(cc_mask); /* diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c index b7664d018851..8fa58b0f3cb3 100644 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -27,13 +27,17 @@ #include <asm/cpu_device_id.h> #include <asm/simd.h> +#define POLYVAL_ALIGN 16 +#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) +#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) +#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA) #define NUM_KEY_POWERS 8 struct polyval_tfm_ctx { /* * These powers must be in the order h^8, ..., h^1. */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR; }; struct polyval_desc_ctx { @@ -45,6 +49,11 @@ asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator); asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); +static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) +{ + return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN); +} + static void internal_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator) { @@ -72,7 +81,7 @@ static void internal_polyval_mul(u8 *op1, const u8 *op2) static int polyval_x86_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { - struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); + struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm); int i; if (keylen != POLYVAL_BLOCK_SIZE) @@ -102,7 +111,7 @@ static int polyval_x86_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); u8 *pos; unsigned int nblocks; unsigned int n; @@ -143,7 +152,7 @@ static int polyval_x86_update(struct shash_desc *desc, static int polyval_x86_final(struct shash_desc *desc, u8 *dst) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); if (dctx->bytes) { internal_polyval_mul(dctx->buffer, @@ -167,7 +176,7 @@ static struct shash_alg polyval_alg = { .cra_driver_name = "polyval-clmulni", .cra_priority = 200, .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct polyval_tfm_ctx), + .cra_ctxsize = POLYVAL_CTX_SIZE, .cra_module = THIS_MODULE, }, }; diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 3271735f0070..4cb710efbdd9 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -801,7 +801,7 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, /* Extension Memory */ if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { - data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL; if (op_data2->rmt_node) { data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; /* IBS doesn't provide Remote socket detail */ diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a646a5f9a235..1b92bf05fd65 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4911,6 +4911,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 11, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c), INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e), diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7839507b3844..446d2833efa7 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -982,8 +982,13 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */ INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), /* MEM_INST_RETIRED.LOAD */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), /* MEM_INST_RETIRED.STORE */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */ @@ -1004,8 +1009,13 @@ struct event_constraint intel_spr_pebs_event_constraints[] = { INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), INTEL_PLD_CONSTRAINT(0x1cd, 0xfe), INTEL_PSD_CONSTRAINT(0x2cd, 0x1), - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4fce1a4226e3..8259d725054d 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1596,7 +1596,7 @@ void __init intel_pmu_arch_lbr_init(void) return; clear_arch_lbr: - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); + setup_clear_cpu_cap(X86_FEATURE_ARCH_LBR); } /** diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 77e3a47af5ad..a829492bca4c 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -619,12 +619,8 @@ static int rapl_check_hw_unit(struct rapl_model *rm) case RAPL_UNIT_QUIRK_INTEL_HSW: rapl_hw_unit[PERF_RAPL_RAM] = 16; break; - /* - * SPR shares the same DRAM domain energy unit as HSW, plus it - * also has a fixed energy unit for Psys domain. - */ + /* SPR uses a fixed energy unit for Psys domain. */ case RAPL_UNIT_QUIRK_INTEL_SPR: - rapl_hw_unit[PERF_RAPL_RAM] = 16; rapl_hw_unit[PERF_RAPL_PSYS] = 0; break; default: @@ -806,7 +802,11 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 5d75fe229342..347707d459c6 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -107,6 +107,11 @@ #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ +#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF + +#define INTEL_FAM6_GRANITERAPIDS_X 0xAD +#define INTEL_FAM6_GRANITERAPIDS_D 0xAE + #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_N 0xBE @@ -118,7 +123,7 @@ #define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_FAM6_METEORLAKE_L 0xAA -/* "Small Core" Processors (Atom) */ +/* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ #define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ @@ -145,6 +150,10 @@ #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ +#define INTEL_FAM6_SIERRAFOREST_X 0xAF + +#define INTEL_FAM6_GRANDRIDGE 0xB6 + /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 0bef44d30a27..2fd52b65deac 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -25,8 +25,10 @@ arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) { u64 start = rmrr->base_address; u64 end = rmrr->end_address + 1; + int entry_type; - if (e820__mapped_all(start, end, E820_TYPE_RESERVED)) + entry_type = e820__get_entry_type(start, end); + if (entry_type == E820_TYPE_RESERVED || entry_type == E820_TYPE_NVS) return 0; pr_err(FW_BUG "No firmware reserved region can cover this RMRR [%#018Lx-%#018Lx], contact BIOS vendor for fixes\n", diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 3b87d889b6e1..888731ccf1f6 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -10,10 +10,13 @@ /* Even with __builtin_ the compiler may decide to use the out of line function. */ +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) +#include <linux/kmsan_string.h> +#endif + #define __HAVE_ARCH_MEMCPY 1 -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memcpy -void *__msan_memcpy(void *dst, const void *src, size_t size); #define memcpy __msan_memcpy #else extern void *memcpy(void *to, const void *from, size_t len); @@ -21,7 +24,7 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) extern void *__msan_memset(void *s, int c, size_t n); #undef memset #define memset __msan_memset @@ -67,7 +70,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memmove void *__msan_memmove(void *dest, const void *src, size_t len); #define memmove __msan_memmove diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index 59358d1bf880..fd2669b1cb2d 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -6,7 +6,7 @@ #ifndef _ASM_X86_SYSCALL_WRAPPER_H #define _ASM_X86_SYSCALL_WRAPPER_H -struct pt_regs; +#include <asm/ptrace.h> extern long __x64_sys_ni_syscall(const struct pt_regs *regs); extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bc614cfe21b..1cc756eafa44 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -254,24 +254,25 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ __typeof__(*(ptr)) __x = (x); /* eval x once */ \ - __chk_user_ptr(ptr); \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ switch (size) { \ case 1: \ - __put_user_goto(__x, ptr, "b", "iq", label); \ + __put_user_goto(__x, __ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(__x, ptr, "w", "ir", label); \ + __put_user_goto(__x, __ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(__x, ptr, "l", "ir", label); \ + __put_user_goto(__x, __ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(__x, ptr, label); \ + __put_user_goto_u64(__x, __ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ - instrument_put_user(__x, ptr, size); \ + instrument_put_user(__x, __ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index e7410e98fc1f..3a35dec3ec55 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -440,7 +440,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p return ret; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev >= mc->hdr.patch_id) + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) return ret; if (!__apply_microcode_amd(mc)) { @@ -528,8 +534,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* Check whether we have saved a new patch already: */ - if (*new_rev && rev < mc->hdr.patch_id) { + /* + * Check whether a new patch has been saved already. Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. + */ + if (*new_rev && rev <= mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; return; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index de62b0b87ced..3266ea36667c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -66,9 +66,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L3, .name = "L3", .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -83,9 +80,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L2, .name = "L2", .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L2), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -836,6 +830,7 @@ static __init void rdt_init_res_defs_intel(void) r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; + r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; hw_res->msr_update = mba_wrmsr_intel; @@ -856,6 +851,7 @@ static __init void rdt_init_res_defs_amd(void) r->cache.arch_has_sparse_bitmaps = true; r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; + r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_BW_BASE; hw_res->msr_update = mba_wrmsr_amd; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 132a2de44d2f..5e868b62a7c4 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int die_select_mask, die_level_siblings; + unsigned int pkg_mask_width; bool die_level_present = false; int leaf; @@ -111,10 +112,10 @@ int detect_extended_topology(struct cpuinfo_x86 *c) core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; - do { + while (true) { cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); /* @@ -132,10 +133,15 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } + if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) + pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + else + break; + sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + } - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; @@ -148,7 +154,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) } c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - die_plus_mask_width); + pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 621f4b6cac4a..8946f89761cc 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -210,13 +210,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpstate_reset(¤t->thread.fpu); } -static void __init fpu__init_init_fpstate(void) -{ - /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = fpu_kernel_cfg.max_features; -} - /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: @@ -236,5 +229,4 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c8340156bfd2..59e543b95a3c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -360,7 +360,7 @@ static void __init setup_init_fpu_buf(void) print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 @@ -678,20 +678,6 @@ static unsigned int __init get_xsave_size_user(void) return ebx; } -/* - * Will the runtime-enumerated 'xstate_size' fit in the init - * task's statically-allocated buffer? - */ -static bool __init is_supported_xstate_size(unsigned int test_xstate_size) -{ - if (test_xstate_size <= sizeof(init_fpstate.regs)) - return true; - - pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(init_fpstate.regs), test_xstate_size); - return false; -} - static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ @@ -717,10 +703,6 @@ static int __init init_xstate_size(void) kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); - /* Ensure we have the space to store all default enabled features. */ - if (!is_supported_xstate_size(kernel_default_size)) - return -EINVAL; - if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; @@ -875,6 +857,19 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); + /* + * init_fpstate excludes dynamic states as they are large but init + * state is zero. + */ + init_fpstate.size = fpu_kernel_cfg.default_size; + init_fpstate.xfeatures = fpu_kernel_cfg.default_features; + + if (init_fpstate.size > sizeof(init_fpstate.regs)) { + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + sizeof(init_fpstate.regs), init_fpstate.size); + goto out_disable; + } + setup_init_fpu_buf(); /* @@ -1130,6 +1125,15 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = fpstate->user_xfeatures; + /* + * Dynamic features are not present in init_fpstate. When they are + * in an all zeros init state, remove those from 'mask' to zero + * those features in the user buffer instead of retrieving them + * from init_fpstate. + */ + if (fpu_state_size_dynamic()) + mask &= (header.xfeatures | xinit->header.xcomp_bv); + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index dfeb227de561..2a4be92fd144 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -4,6 +4,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/ptrace.h> #include <asm/ftrace.h> #include <asm/export.h> @@ -129,6 +130,14 @@ .endm +SYM_TYPED_FUNC_START(ftrace_stub) + RET +SYM_FUNC_END(ftrace_stub) + +SYM_TYPED_FUNC_START(ftrace_stub_graph) + RET +SYM_FUNC_END(ftrace_stub_graph) + #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) @@ -172,21 +181,10 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) -SYM_FUNC_START(ftrace_epilogue) -/* - * This is weak to keep gas from relaxing the jumps. - */ -SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) - UNWIND_HINT_FUNC - ENDBR - RET -SYM_FUNC_END(ftrace_epilogue) - SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq @@ -262,14 +260,11 @@ SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) popfq /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. + * The trampoline will add the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - jmp ftrace_epilogue + RET /* Swap the flags with orig_rax */ 1: movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -280,7 +275,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) @@ -291,9 +286,6 @@ STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace - -SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) - ENDBR RET trace: diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 0ea57da92940..c059820dfaea 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -713,7 +713,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Otherwise, skip ahead to the user-specified starting frame: */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->sp < (unsigned long)first_frame)) + state->sp <= (unsigned long)first_frame)) unwind_next_frame(state); return; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7065462378e2..62bc7a01cecc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1133,11 +1133,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: + entry->ebx &= ~GENMASK(27, 16); cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; case 0x80000006: - /* L2 cache and TLB: pass through host info. */ + /* Drop reserved bits, pass host L2 cache and TLB info. */ + entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -1167,6 +1169,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); + entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); break; @@ -1186,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ecx = entry->edx = 0; break; case 0x8000001a: + entry->eax &= GENMASK(2, 0); + entry->ebx = entry->ecx = entry->edx = 0; + break; case 0x8000001e: break; case 0x8000001F: @@ -1193,7 +1199,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; } else { cpuid_entry_override(entry, CPUID_8000_001F_EAX); - + /* Clear NumVMPL since KVM does not support VMPL. */ + entry->ebx &= ~GENMASK(31, 12); /* * Enumerate '0' for "PA bits reduction", the adjusted * MAXPHYADDR is enumerated directly (see 0x80000008). @@ -1331,7 +1338,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; - array.entries = kvcalloc(sizeof(struct kvm_cpuid_entry2), cpuid->nent, GFP_KERNEL); + array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL); if (!array.entries) return -ENOMEM; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index cfed36aba2f7..c1390357126a 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -158,11 +158,16 @@ out: static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) { struct kvm *kvm = inode->i_private; + int r; if (!kvm_get_kvm_safe(kvm)) return -ENOENT; - return single_open(file, kvm_mmu_rmaps_stat_show, kvm); + r = single_open(file, kvm_mmu_rmaps_stat_show, kvm); + if (r < 0) + kvm_put_kvm(kvm); + + return r; } static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3b27622d4642..4a43261d25a2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -791,8 +791,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ctxt->mode, linear); } -static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, - enum x86emul_mode mode) +static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) { ulong linear; int rc; @@ -802,41 +801,71 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); - rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); + rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; } +static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt) +{ + u64 efer; + struct desc_struct cs; + u16 selector; + u32 base3; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) { + /* Real mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_REAL; + return X86EMUL_CONTINUE; + } + + if (ctxt->eflags & X86_EFLAGS_VM) { + /* Protected/VM86 mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_VM86; + return X86EMUL_CONTINUE; + } + + if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS)) + return X86EMUL_UNHANDLEABLE; + + if (efer & EFER_LMA) { + if (cs.l) { + /* Proper long mode */ + ctxt->mode = X86EMUL_MODE_PROT64; + } else if (cs.d) { + /* 32 bit compatibility mode*/ + ctxt->mode = X86EMUL_MODE_PROT32; + } else { + ctxt->mode = X86EMUL_MODE_PROT16; + } + } else { + /* Legacy 32 bit / 16 bit mode */ + ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + } + + return X86EMUL_CONTINUE; +} + static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) { - return assign_eip(ctxt, dst, ctxt->mode); + return assign_eip(ctxt, dst); } -static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, - const struct desc_struct *cs_desc) +static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst) { - enum x86emul_mode mode = ctxt->mode; - int rc; + int rc = emulator_recalc_and_set_mode(ctxt); -#ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT16) { - if (cs_desc->l) { - u64 efer = 0; + if (rc != X86EMUL_CONTINUE) + return rc; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; - } else - mode = X86EMUL_MODE_PROT32; /* temporary value */ - } -#endif - if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) - mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - rc = assign_eip(ctxt, dst, mode); - if (rc == X86EMUL_CONTINUE) - ctxt->mode = mode; - return rc; + return assign_eip(ctxt, dst); } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -2172,7 +2201,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2250,7 +2279,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, eip, &new_desc); + rc = assign_eip_far(ctxt, eip); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2432,7 +2461,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - for (i = 0; i < NR_EMULATOR_GPRS; i++) + for (i = 0; i < 8; i++) *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); @@ -2489,7 +2518,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u16 selector; int i, r; - for (i = 0; i < NR_EMULATOR_GPRS; i++) + for (i = 0; i < 16; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); @@ -2633,7 +2662,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) * those side effects need to be explicitly handled for both success * and shutdown. */ - return X86EMUL_CONTINUE; + return emulator_recalc_and_set_mode(ctxt); emulate_shutdown: ctxt->ops->triple_fault(ctxt); @@ -2876,6 +2905,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ctxt->_eip = rdx; + ctxt->mode = usermode; *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; @@ -3469,7 +3499,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); if (rc != X86EMUL_CONTINUE) goto fail; @@ -3611,11 +3641,25 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) static int em_cr_write(struct x86_emulate_ctxt *ctxt) { - if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) + int cr_num = ctxt->modrm_reg; + int r; + + if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val)) return emulate_gp(ctxt, 0); /* Disable writeback. */ ctxt->dst.type = OP_NONE; + + if (cr_num == 0) { + /* + * CR0 write might have updated CR0.PE and/or CR0.PG + * which can affect the cpu's execution mode. + */ + r = emulator_recalc_and_set_mode(ctxt); + if (r != X86EMUL_CONTINUE) + return r; + } + return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 87c4e46daf37..07254314f3dd 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -24,8 +24,6 @@ extern int __read_mostly pt_mode; #define PMU_CAP_FW_WRITES (1ULL << 13) #define PMU_CAP_LBR_FMT 0x3f -#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) - struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We @@ -400,6 +398,7 @@ static inline bool vmx_pebs_supported(void) static inline u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PMU_CAP_FW_WRITES; + struct x86_pmu_lbr lbr; u64 host_perf_cap = 0; if (!enable_pmu) @@ -408,7 +407,8 @@ static inline u64 vmx_get_perf_capabilities(void) if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr) + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; @@ -419,19 +419,6 @@ static inline u64 vmx_get_perf_capabilities(void) return perf_cap; } -static inline u64 vmx_supported_debugctl(void) -{ - u64 debugctl = 0; - - if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - - if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) - debugctl |= DEBUGCTLMSR_LBR_MASK; - - return debugctl; -} - static inline bool cpu_has_notify_vmexit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9dba04b6b019..63247c57c72c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2021,15 +2021,17 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, return (unsigned long)data; } -static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) +static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) { - u64 debugctl = vmx_supported_debugctl(); + u64 debugctl = 0; - if (!intel_pmu_lbr_is_enabled(vcpu)) - debugctl &= ~DEBUGCTLMSR_LBR_MASK; + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) + debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) - debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) && + (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) + debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; return debugctl; } @@ -2103,7 +2105,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: { - u64 invalid = data & ~vcpu_supported_debugctl(vcpu); + u64 invalid; + + invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { if (report_ignored_msrs) vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", @@ -8263,6 +8267,11 @@ static __init int hardware_setup(void) if (!cpu_has_virtual_nmis()) enable_vnmi = 0; +#ifdef CONFIG_X86_SGX_KVM + if (!cpu_has_vmx_encls_vmexit()) + enable_sgx = false; +#endif + /* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4bd5f8a751de..5f5eb577d583 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2315,11 +2315,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, /* we verify if the enable bit is set... */ if (system_time & 1) { - kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu, - KVM_HOST_USES_PFN, system_time & ~1ULL, - sizeof(struct pvclock_vcpu_time_info)); + kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, + KVM_HOST_USES_PFN, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); } else { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); } return; @@ -3388,7 +3388,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -6442,26 +6442,22 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, return 0; } -static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, + struct kvm_msr_filter *filter) { - struct kvm_msr_filter __user *user_msr_filter = argp; struct kvm_x86_msr_filter *new_filter, *old_filter; - struct kvm_msr_filter filter; bool default_allow; bool empty = true; int r = 0; u32 i; - if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) - return -EFAULT; - - if (filter.flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) - empty &= !filter.ranges[i].nmsrs; + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) + empty &= !filter->ranges[i].nmsrs; - default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); if (empty && !default_allow) return -EINVAL; @@ -6469,8 +6465,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (!new_filter) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { + r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); if (r) { kvm_free_msr_filter(new_filter); return r; @@ -6493,6 +6489,62 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) return 0; } +#ifdef CONFIG_KVM_COMPAT +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range_compat { + __u32 flags; + __u32 nmsrs; + __u32 base; + __u32 bitmap; +}; + +struct kvm_msr_filter_compat { + __u32 flags; + struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; + +#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) + +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm *kvm = filp->private_data; + long r = -ENOTTY; + + switch (ioctl) { + case KVM_X86_SET_MSR_FILTER_COMPAT: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter_compat filter_compat; + struct kvm_msr_filter filter; + int i; + + if (copy_from_user(&filter_compat, user_msr_filter, + sizeof(filter_compat))) + return -EFAULT; + + filter.flags = filter_compat.flags; + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + struct kvm_msr_filter_range_compat *cr; + + cr = &filter_compat.ranges[i]; + filter.ranges[i] = (struct kvm_msr_filter_range) { + .flags = cr->flags, + .nmsrs = cr->nmsrs, + .base = cr->base, + .bitmap = (__u8 *)(ulong)cr->bitmap, + }; + } + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); + break; + } + } + + return r; +} +#endif + #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_arch_suspend_notifier(struct kvm *kvm) { @@ -6915,9 +6967,16 @@ set_pit2_out: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; - case KVM_X86_SET_MSR_FILTER: - r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + case KVM_X86_SET_MSR_FILTER: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); break; + } default: r = -ENOTTY; } @@ -9985,7 +10044,20 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, kvm_x86_ops.nested_ops->has_events(vcpu)) *req_immediate_exit = true; - WARN_ON(kvm_is_exception_pending(vcpu)); + /* + * KVM must never queue a new exception while injecting an event; KVM + * is done emulating and should only propagate the to-be-injected event + * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an + * infinite loop as KVM will bail from VM-Enter to inject the pending + * exception and start the cycle all over. + * + * Exempt triple faults as they have special handling and won't put the + * vCPU into an infinite loop. Triple fault can be queued when running + * VMX without unrestricted guest, as that requires KVM to emulate Real + * Mode events (see kvm_inject_realmode_interrupt()). + */ + WARN_ON_ONCE(vcpu->arch.exception.pending || + vcpu->arch.exception_vmexit.pending); return 0; out: @@ -10332,7 +10404,10 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, kvm->arch.apicv_inhibit_reasons = new; if (new) { unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); + int idx = srcu_read_lock(&kvm->srcu); + kvm_zap_gfn_range(kvm, gfn, gfn+1); + srcu_read_unlock(&kvm->srcu, idx); } } else { kvm->arch.apicv_inhibit_reasons = new; @@ -11757,6 +11832,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + kvm_gpc_init(&vcpu->arch.pv_time); + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 93c628d3e3a9..2dae413bd62a 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -42,13 +42,13 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) int idx = srcu_read_lock(&kvm->srcu); if (gfn == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(kvm, gpc); + kvm_gpc_deactivate(kvm, gpc); goto out; } do { - ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN, - gpa, PAGE_SIZE); + ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, + PAGE_SIZE); if (ret) goto out; @@ -554,15 +554,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_info)); + r = kvm_gpc_activate(vcpu->kvm, + &vcpu->arch.xen.vcpu_info_cache, NULL, + KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -570,16 +570,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct pvclock_vcpu_time_info)); + r = kvm_gpc_activate(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache, + NULL, KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; @@ -590,16 +590,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.runstate_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_runstate_info)); + r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, + NULL, KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct vcpu_runstate_info)); break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: @@ -1667,18 +1666,18 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, case EVTCHNSTAT_ipi: /* IPI must map back to the same port# */ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) - goto out; /* -EINVAL */ + goto out_noeventfd; /* -EINVAL */ break; case EVTCHNSTAT_interdomain: if (data->u.evtchn.deliver.port.port) { if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) - goto out; /* -EINVAL */ + goto out_noeventfd; /* -EINVAL */ } else { eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd); if (IS_ERR(eventfd)) { ret = PTR_ERR(eventfd); - goto out; + goto out_noeventfd; } } break; @@ -1718,6 +1717,7 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, out: if (eventfd) eventfd_ctx_put(eventfd); +out_noeventfd: kfree(evtchnfd); return ret; } @@ -1816,7 +1816,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) { vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; vcpu->arch.xen.poll_evtchn = 0; + timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); + + kvm_gpc_init(&vcpu->arch.xen.runstate_cache); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -1824,18 +1829,17 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); + del_timer_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) { idr_init(&kvm->arch.xen.evtchn_ports); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache); } void kvm_xen_destroy_vm(struct kvm *kvm) @@ -1843,7 +1847,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm) struct evtchnfd *evtchnfd; int i; - kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 97342c42dda8..2e5a045731de 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -587,6 +587,10 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star { unsigned long end; + /* Kernel text is rw at boot up */ + if (system_state == SYSTEM_BOOTING) + return new; + /* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99620428ad78..00127abd89ee 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -11,6 +11,7 @@ #include <linux/bpf.h> #include <linux/memory.h> #include <linux/sort.h> +#include <linux/init.h> #include <asm/extable.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> @@ -388,6 +389,18 @@ out: return ret; } +int __init bpf_arch_init_dispatcher_early(void *ip) +{ + const u8 *nop_insn = x86_nops[5]; + + if (is_endbr(*(u32 *)ip)) + ip += ENDBR_INSN_SIZE; + + if (memcmp(ip, nop_insn, X86_PATCH_SIZE)) + text_poke_early(ip, nop_insn, X86_PATCH_SIZE); + return 0; +} + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 58a200dc762d..17f09dc26381 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -26,6 +26,7 @@ GCOV_PROFILE := n KASAN_SANITIZE := n UBSAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n KCOV_INSTRUMENT := n # These are adjustments to the compiler flags used for objects that diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 68aff1382872..246d67dab510 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -302,7 +302,7 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read, bool *emul) { - int type, index; + int type, index = 0; if (is_amd_pmu_msr(msr)) *emul = xen_amd_pmu_emulate(msr, val, is_read); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index cfa99e8f054b..4f4309500559 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -910,17 +910,9 @@ static int register_callback(unsigned type, const void *func) void xen_enable_sysenter(void) { - int ret; - unsigned sysenter_feature; - - sysenter_feature = X86_FEATURE_SYSENTER32; - - if (!boot_cpu_has(sysenter_feature)) - return; - - ret = register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat); - if(ret != 0) - setup_clear_cpu_cap(sysenter_feature); + if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) && + register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat)) + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); } void xen_enable_syscall(void) @@ -934,12 +926,9 @@ void xen_enable_syscall(void) mechanism for syscalls. */ } - if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { - ret = register_callback(CALLBACKTYPE_syscall32, - xen_entry_SYSCALL_compat); - if (ret != 0) - setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); - } + if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) && + register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat)) + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); } static void __init xen_pvmmu_arch_setup(void) |