Diffstat (limited to 'arch/x86/kernel')
100 files changed, 3756 insertions, 2403 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a20a5ebfacd7..96d51bbc2bd4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -3,10 +3,6 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o -extra-y += head$(BITS).o -extra-y += ebda.o -extra-y += platform-quirks.o extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) @@ -33,6 +29,8 @@ KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n +KMSAN_SANITIZE_head$(BITS).o := n +KMSAN_SANITIZE_nmi.o := n # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, @@ -42,7 +40,11 @@ KCOV_INSTRUMENT := n CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace -obj-y := process_$(BITS).o signal.o +obj-y += head_$(BITS).o +obj-y += head$(BITS).o +obj-y += ebda.o +obj-y += platform-quirks.o +obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o @@ -52,7 +54,7 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_ia32.o -obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o +obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o signal_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o @@ -139,6 +141,10 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o +obj-$(CONFIG_CFI_CLANG) += cfi.o + +obj-$(CONFIG_CALL_THUNKS) += callthunks.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 7945eae5b315..401808b47af3 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -52,17 +52,25 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, if (c->x86_vendor == X86_VENDOR_INTEL && (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; - /* - * For all recent Centaur CPUs, the ucode will make sure that each - * core can keep cache coherence with each other while entering C3 - * type state. So, set bm_check to 1 to indicate that the kernel - * doesn't need to execute a cache flush operation (WBINVD) when - * entering C3 type state. - */ + if (c->x86_vendor == X86_VENDOR_CENTAUR) { if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && - c->x86_stepping >= 0x0e)) + c->x86_stepping >= 0x0e)) { + /* + * For all recent Centaur CPUs, the ucode will make sure that each + * core can keep cache coherence with each other while entering C3 + * type state. So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ flags->bm_check = 1; + /* + * For all recent Centaur platforms, ARB_DISABLE is a nop. + * Set bm_control to zero to indicate that ARB_DISABLE is + * not required while entering C3 type state. 
+ */ + flags->bm_control = 0; + } } if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 62f6b8b7c4a5..7d8c3cbde368 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -116,6 +116,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; @@ -377,6 +378,56 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } +static inline bool is_jcc32(struct insn *insn) +{ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; +} + +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 op = insn->opcode.bytes[0]; + int i = 0; + + /* + * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional + * tail-calls. Deal with them. + */ + if (is_jcc32(insn)) { + bytes[i++] = op; + op = insn->opcode.bytes[1]; + goto clang_jcc; + } + + if (insn->length == 6) + bytes[i++] = 0x2e; /* CS-prefix */ + + switch (op) { + case CALL_INSN_OPCODE: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_call_thunk_array[reg], + CALL_INSN_SIZE); + i += CALL_INSN_SIZE; + break; + + case JMP32_INSN_OPCODE: +clang_jcc: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_jump_thunk_array[reg], + JMP32_INSN_SIZE); + i += JMP32_INSN_SIZE; + break; + + default: + WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); + return -1; + } + + WARN_ON_ONCE(i != insn->length); + + return i; +} + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -409,8 +460,12 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return emit_call_track_retpoline(addr, insn, reg, bytes); + return -1; + } op = insn->opcode.bytes[0]; @@ -427,8 +482,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) * [ NOP ] * 1: */ - /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ - if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ @@ -453,6 +507,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) return ret; i += ret; + /* + * The compiler is supposed to EMIT an INT3 after every unconditional + * JMP instruction due to AMD BTC. However, if the compiler is too old + * or SLS isn't enabled, we still need an INT3 after indirect JMPs + * even on Intel. + */ + if (op == JMP32_INSN_OPCODE && i < insn->length) + bytes[i++] = INT3_INSN_OPCODE; + for (; i < insn->length;) bytes[i++] = BYTES_NOP1; @@ -509,6 +572,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } #ifdef CONFIG_RETHUNK + +#ifdef CONFIG_CALL_THUNKS +void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +#endif + /* * Rewrite the compiler generated return thunk tail-calls. 
* @@ -524,14 +592,18 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - return -1; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (x86_return_thunk == __x86_return_thunk) + return -1; - bytes[i++] = RET_INSN_OPCODE; + i = JMP32_INSN_SIZE; + __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); + } else { + bytes[i++] = RET_INSN_OPCODE; + } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; - return i; } @@ -585,6 +657,28 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT +static void poison_endbr(void *addr, bool warn) +{ + u32 endbr, poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + return; + + if (!is_endbr(endbr)) { + WARN_ON_ONCE(warn); + return; + } + + DPRINTK("ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); +} + /* * Generated by: objtool --ibt */ @@ -593,31 +687,391 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) s32 *s; for (s = start; s < end; s++) { - u32 endbr, poison = gen_endbr_poison(); void *addr = (void *)s + *s; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + poison_endbr(addr, true); + if (IS_ENABLED(CONFIG_FINEIBT)) + poison_endbr(addr - 16, false); + } +} + +#else + +void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { } + +#endif /* CONFIG_X86_KERNEL_IBT */ + +#ifdef CONFIG_FINEIBT + +enum cfi_mode { + CFI_DEFAULT, + CFI_OFF, + CFI_KCFI, + CFI_FINEIBT, +}; + +static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; +static bool cfi_rand __ro_after_init = true; +static u32 cfi_seed __ro_after_init; + +/* + * Re-hash the CFI hash with a boot-time seed while making sure the result is + * not a valid ENDBR instruction. 
+ */ +static u32 cfi_rehash(u32 hash) +{ + hash ^= cfi_seed; + while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + bool lsb = hash & 1; + hash >>= 1; + if (lsb) + hash ^= 0x80200003; + } + return hash; +} + +static __init int cfi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "auto")) { + cfi_mode = CFI_DEFAULT; + } else if (!strcmp(str, "off")) { + cfi_mode = CFI_OFF; + cfi_rand = false; + } else if (!strcmp(str, "kcfi")) { + cfi_mode = CFI_KCFI; + } else if (!strcmp(str, "fineibt")) { + cfi_mode = CFI_FINEIBT; + } else if (!strcmp(str, "norand")) { + cfi_rand = false; + } else { + pr_err("Ignoring unknown cfi option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("cfi", cfi_parse_cmdline); + +/* + * kCFI FineIBT + * + * __cfi_\func: __cfi_\func: + * movl $0x12345678,%eax // 5 endbr64 // 4 + * nop subl $0x12345678,%r10d // 7 + * nop jz 1f // 2 + * nop ud2 // 2 + * nop 1: nop // 1 + * nop + * nop + * nop + * nop + * nop + * nop + * nop + * + * + * caller: caller: + * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 + * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * je 1f // 2 nop4 // 4 + * ud2 // 2 + * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * + */ + +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + " je fineibt_preamble_end \n" + " ud2 \n" + " nop \n" + "fineibt_preamble_end: \n" + ".popsection\n" +); + +extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_end[]; + +#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_hash 7 + +asm( ".pushsection .rodata \n" + "fineibt_caller_start: \n" + " movl $0x12345678, %r10d \n" + " sub $16, %r11 \n" + ASM_NOP4 + "fineibt_caller_end: \n" + ".popsection \n" +); + +extern u8 fineibt_caller_start[]; +extern u8 fineibt_caller_end[]; + +#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) +#define fineibt_caller_hash 2 + +#define fineibt_caller_jmp (fineibt_caller_size - 2) + +static u32 decode_preamble_hash(void *addr) +{ + u8 *p = addr; + + /* b8 78 56 34 12 mov $0x12345678,%eax */ + if (p[0] == 0xb8) + return *(u32 *)(addr + 1); + + return 0; /* invalid hash value */ +} + +static u32 decode_caller_hash(void *addr) +{ + u8 *p = addr; + + /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + if (p[0] == 0x41 && p[1] == 0xba) + return -*(u32 *)(addr + 2); + + /* e8 0c 78 56 34 12 jmp.d8 +12 */ + if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) + return -*(u32 *)(addr + 2); + + return 0; /* invalid hash value */ +} + +/* .retpoline_sites */ +static int cfi_disable_callers(s32 *start, s32 *end) +{ + /* + * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate + * in tact for later usage. Also see decode_caller_hash() and + * cfi_rewrite_callers(). + */ + const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - if (WARN_ON_ONCE(!is_endbr(endbr))) + text_poke_early(addr, jmp, 2); + } + + return 0; +} + +static int cfi_enable_callers(s32 *start, s32 *end) +{ + /* + * Re-enable kCFI, undo what cfi_disable_callers() did. 
+ */ + const u8 mov[] = { 0x41, 0xba }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + text_poke_early(addr, mov, 2); + } - /* - * When we have IBT, the lack of ENDBR will trigger #CP - */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + return 0; +} + +/* .cfi_sites */ +static int cfi_rand_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + hash = cfi_rehash(hash); + text_poke_early(addr + 1, &hash, 4); + } + + return 0; +} + +static int cfi_rewrite_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + } + + return 0; +} + +/* .retpoline_sites */ +static int cfi_rand_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + hash = -cfi_rehash(hash); + text_poke_early(addr + 2, &hash, 4); + } + } + + return 0; +} + +static int cfi_rewrite_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + } + /* rely on apply_retpolines() */ + } + + return 0; +} + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ + int ret; + + if (WARN_ONCE(fineibt_preamble_size != 16, + "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) + return; + + if (cfi_mode == CFI_DEFAULT) { + cfi_mode = CFI_KCFI; + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + cfi_mode = CFI_FINEIBT; + } + + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. 
+ */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (cfi_rand) { + if (builtin) + cfi_seed = get_random_u32(); + + ret = cfi_rand_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rand_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + } + + switch (cfi_mode) { + case CFI_OFF: + if (builtin) + pr_info("Disabling CFI\n"); + return; + + case CFI_KCFI: + ret = cfi_enable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Using kCFI\n"); + return; + + case CFI_FINEIBT: + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Using FineIBT CFI\n"); + return; + + default: + break; } + +err: + pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } #else -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ +} -#endif /* CONFIG_X86_KERNEL_IBT */ +#endif + +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ + return __apply_fineibt(start_retpoline, end_retpoline, + start_cfi, end_cfi, + /* .builtin = */ false); +} #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, @@ -925,6 +1379,9 @@ void __init alternative_instructions(void) */ apply_paravirt(__parainstructions, __parainstructions_end); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, + __cfi_sites, __cfi_sites_end, true); + /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. @@ -938,6 +1395,12 @@ void __init alternative_instructions(void) */ apply_alternatives(__alt_instructions, __alt_instructions_end); + /* + * Now all calls are established. Apply the call thunks if + * required. + */ + callthunks_patch_builtin_calls(); + apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP @@ -1227,27 +1690,15 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) return __text_poke(text_poke_memcpy, addr, opcode, len); } -/** - * text_poke_copy - Copy instructions into (an unused part of) RX memory - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy, could be more than 2x PAGE_SIZE - * - * Not safe against concurrent execution; useful for JITs to dump - * new code blocks into unused regions of RX memory. Can be used in - * conjunction with synchronize_rcu_tasks() to wait for existing - * execution to quiesce after having made sure no existing functions - * pointers are live. 
- */ -void *text_poke_copy(void *addr, const void *opcode, size_t len) +void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, + bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; - if (WARN_ON_ONCE(core_kernel_text(start))) + if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; - mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; @@ -1257,6 +1708,25 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } + return addr; +} + +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. + */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + mutex_lock(&text_mutex); + addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } @@ -1319,22 +1789,23 @@ struct bp_patching_desc { atomic_t refs; }; -static struct bp_patching_desc *bp_desc; +static struct bp_patching_desc bp_desc; static __always_inline -struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +struct bp_patching_desc *try_get_desc(void) { - /* rcu_dereference */ - struct bp_patching_desc *desc = __READ_ONCE(*descp); + struct bp_patching_desc *desc = &bp_desc; - if (!desc || !arch_atomic_inc_not_zero(&desc->refs)) + if (!arch_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; } -static __always_inline void put_desc(struct bp_patching_desc *desc) +static __always_inline void put_desc(void) { + struct bp_patching_desc *desc = &bp_desc; + smp_mb__before_atomic(); arch_atomic_dec(&desc->refs); } @@ -1367,15 +1838,15 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc: + * bp_desc with non-zero refcount: * - * bp_desc = desc INT3 + * bp_desc.refs = 1 INT3 * WMB RMB - * write INT3 if (desc) + * write INT3 if (bp_desc.refs != 0) */ smp_rmb(); - desc = try_get_desc(&bp_desc); + desc = try_get_desc(); if (!desc) return 0; @@ -1429,7 +1900,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(desc); + put_desc(); return ret; } @@ -1460,18 +1931,20 @@ static int tp_vec_nr; */ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - struct bp_patching_desc desc = { - .vec = tp, - .nr_entries = nr_entries, - .refs = ATOMIC_INIT(1), - }; unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); - smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ + bp_desc.vec = tp; + bp_desc.nr_entries = nr_entries; + + /* + * Corresponds to the implicit memory barrier in try_get_desc() to + * ensure reading a non-zero refcount provides up to date bp_desc data. 
+ */ + atomic_set_release(&bp_desc.refs, 1); /* * Corresponding read barrier in int3 notifier for making sure the @@ -1559,12 +2032,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries text_poke_sync(); /* - * Remove and synchronize_rcu(), except we have a very primitive - * refcount based completion. + * Remove and wait for refs to be zero. */ - WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ - if (!atomic_dec_and_test(&desc.refs)) - atomic_cond_read_acquire(&desc.refs, !VAL); + if (!atomic_dec_and_test(&bp_desc.refs)) + atomic_cond_read_acquire(&bp_desc.refs, !VAL); } static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, @@ -1598,7 +2069,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, default: BUG_ON(len != insn.length); - }; + } switch (tp->opcode) { @@ -1671,11 +2142,6 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi { struct text_poke_loc *tp; - if (unlikely(system_state == SYSTEM_BOOTING)) { - text_poke_early(addr, opcode, len); - return; - } - text_poke_flush(addr); tp = &tp_vec[tp_vec_nr++]; @@ -1697,11 +2163,6 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void * { struct text_poke_loc tp; - if (unlikely(system_state == SYSTEM_BOOTING)) { - text_poke_early(addr, opcode, len); - return; - } - text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 194d54eed537..56a917df410d 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -53,7 +53,7 @@ static u32 *iommu_gatt_base; /* Remapping table */ * of only flushing when an mapping is reused. With it true the GART is * flushed for every mapping. Problem is that doing the lazy flush seems * to trigger bugs with some popular PCI cards, in particular 3ware (but - * has been also also seen with Qlogic at least). + * has been also seen with Qlogic at least). */ static int iommu_fullflush = 1; @@ -504,7 +504,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) } a = aper + iommu_size; - iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; + iommu_size -= round_up(a, PMD_SIZE) - a; if (iommu_size < 64*1024*1024) { pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 7a5630d904b2..4feaa670d578 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -36,7 +36,7 @@ /* * Using 512M as goal, in case kexec will load kernel_big * that will do the on-position decompress, and could overlap with - * with the gart aperture that is used. + * the gart aperture that is used. 
* Sequence: * kernel_small * ==> kexec (with kdump trigger path or gart still enabled) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6d303d1d276c..20d9a604da7c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,6 +61,7 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include <asm/irq_regs.h> +#include <asm/cpu.h> unsigned int num_processors; @@ -1751,11 +1752,26 @@ EXPORT_SYMBOL_GPL(x2apic_mode); enum { X2APIC_OFF, - X2APIC_ON, X2APIC_DISABLED, + /* All states below here have X2APIC enabled */ + X2APIC_ON, + X2APIC_ON_LOCKED }; static int x2apic_state; +static bool x2apic_hw_locked(void) +{ + u64 ia32_cap; + u64 msr; + + ia32_cap = x86_read_arch_cap_msr(); + if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + return (msr & LEGACY_XAPIC_DISABLED); + } + return false; +} + static void __x2apic_disable(void) { u64 msr; @@ -1793,6 +1809,10 @@ static int __init setup_nox2apic(char *str) apicid); return 0; } + if (x2apic_hw_locked()) { + pr_warn("APIC locked in x2apic mode, can't disable\n"); + return 0; + } pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } @@ -1807,10 +1827,18 @@ early_param("nox2apic", setup_nox2apic); void x2apic_setup(void) { /* - * If x2apic is not in ON state, disable it if already enabled + * Try to make the AP's APIC state match that of the BSP, but if the + * BSP is unlocked and the AP is locked then there is a state mismatch. + * Warn about the mismatch in case a GP fault occurs due to a locked AP + * trying to be turned off. + */ + if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked()) + pr_warn("x2apic lock mismatch between BSP and AP.\n"); + /* + * If x2apic is not in ON or LOCKED state, disable it if already enabled * from BIOS. */ - if (x2apic_state != X2APIC_ON) { + if (x2apic_state < X2APIC_ON) { __x2apic_disable(); return; } @@ -1831,6 +1859,11 @@ static __init void x2apic_disable(void) if (x2apic_id >= 255) panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + if (x2apic_hw_locked()) { + pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id); + return; + } + __x2apic_disable(); register_lapic_address(mp_lapic_addr); } @@ -1889,22 +1922,28 @@ void __init check_x2apic(void) if (x2apic_enabled()) { pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; - x2apic_state = X2APIC_ON; + if (x2apic_hw_locked()) + x2apic_state = X2APIC_ON_LOCKED; + else + x2apic_state = X2APIC_ON; } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } } #else /* CONFIG_X86_X2APIC */ -static int __init validate_x2apic(void) +void __init check_x2apic(void) { if (!apic_is_x2apic_enabled()) - return 0; + return; /* - * Checkme: Can we simply turn off x2apic here instead of panic? + * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC? 
*/ - panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n"); + pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n"); + pr_err("Disabling APIC, expect reduced performance and functionality.\n"); + + disable_apic = 1; + setup_clear_cpu_cap(X86_FEATURE_APIC); } -early_initcall(validate_x2apic); static inline void try_to_enable_x2apic(int remap_mode) { } static inline void __x2apic_enable(void) { } diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 7517eb05bdc1..35d5b8fb18ef 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -142,70 +142,139 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) return ret; } -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. +/** + * pci_dev_has_default_msi_parent_domain - Check whether the device has the default + * MSI parent domain associated + * @dev: Pointer to the PCI device */ -static struct irq_chip pci_msi_controller = { - .name = "PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_ack = irq_chip_ack_parent, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE | - IRQCHIP_AFFINITY_PRE_STARTUP, -}; +bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev) +{ + struct irq_domain *domain = dev_get_msi_domain(&dev->dev); -int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, - msi_alloc_info_t *arg) + if (!domain) + domain = dev_get_msi_domain(&dev->bus->dev); + if (!domain) + return false; + + return domain == x86_vector_domain; +} + +/** + * x86_msi_prepare - Setup of msi_alloc_info_t for allocations + * @domain: The domain for which this setup happens + * @dev: The device for which interrupts are allocated + * @nvec: The number of vectors to allocate + * @alloc: The allocation info structure to initialize + * + * This function is to be used for all types of MSI domains above the x86 + * vector domain and any intermediates. It is always invoked from the + * top level interrupt domain. The domain specific allocation + * functionality is determined via the @domain's bus token which allows to + * map the X86 specific allocation type. 
+ */ +static int x86_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *alloc) { - init_irq_alloc_info(arg, NULL); - if (to_pci_dev(dev)->msix_enabled) { - arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; - } else { - arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; - arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + struct msi_domain_info *info = domain->host_data; + + init_irq_alloc_info(alloc, NULL); + + switch (info->bus_token) { + case DOMAIN_BUS_PCI_DEVICE_MSI: + alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + return 0; + case DOMAIN_BUS_PCI_DEVICE_MSIX: + case DOMAIN_BUS_PCI_DEVICE_IMS: + alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + return 0; + default: + return -EINVAL; } - - return 0; } -EXPORT_SYMBOL_GPL(pci_msi_prepare); -static struct msi_domain_ops pci_msi_domain_ops = { - .msi_prepare = pci_msi_prepare, -}; +/** + * x86_init_dev_msi_info - Domain info setup for MSI domains + * @dev: The device for which the domain should be created + * @domain: The (root) domain providing this callback + * @real_parent: The real parent domain of the to initialize domain + * @info: The domain info for the to initialize domain + * + * This function is to be used for all types of MSI domains above the x86 + * vector domain and any intermediates. The domain specific functionality + * is determined via the @real_parent. + */ +static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + const struct msi_parent_ops *pops = real_parent->msi_parent_ops; + + /* MSI parent domain specific settings */ + switch (real_parent->bus_token) { + case DOMAIN_BUS_ANY: + /* Only the vector domain can have the ANY token */ + if (WARN_ON_ONCE(domain != real_parent)) + return false; + info->chip->irq_set_affinity = msi_set_affinity; + /* See msi_set_affinity() for the gory details */ + info->flags |= MSI_FLAG_NOMASK_QUIRK; + break; + case DOMAIN_BUS_DMAR: + case DOMAIN_BUS_AMDVI: + break; + default: + WARN_ON_ONCE(1); + return false; + } + + /* Is the target supported? */ + switch(info->bus_token) { + case DOMAIN_BUS_PCI_DEVICE_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSIX: + break; + case DOMAIN_BUS_PCI_DEVICE_IMS: + if (!(pops->supported_flags & MSI_FLAG_PCI_IMS)) + return false; + break; + default: + WARN_ON_ONCE(1); + return false; + } + + /* + * Mask out the domain specific MSI feature flags which are not + * supported by the real parent. + */ + info->flags &= pops->supported_flags; + /* Enforce the required flags */ + info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED; + + /* This is always invoked from the top level MSI domain! 
*/ + info->ops->msi_prepare = x86_msi_prepare; + + info->chip->irq_ack = irq_chip_ack_parent; + info->chip->irq_retrigger = irq_chip_retrigger_hierarchy; + info->chip->flags |= IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP; -static struct msi_domain_info pci_msi_domain_info = { - .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX, - .ops = &pci_msi_domain_ops, - .chip = &pci_msi_controller, - .handler = handle_edge_irq, - .handler_name = "edge", + info->handler = handle_edge_irq; + info->handler_name = "edge"; + + return true; +} + +static const struct msi_parent_ops x86_vector_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED, + .init_dev_msi_info = x86_init_dev_msi_info, }; struct irq_domain * __init native_create_pci_msi_domain(void) { - struct fwnode_handle *fn; - struct irq_domain *d; - if (disable_apic) return NULL; - fn = irq_domain_alloc_named_fwnode("PCI-MSI"); - if (!fn) - return NULL; - - d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, - x86_vector_domain); - if (!d) { - irq_domain_free_fwnode(fn); - pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); - } else { - d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; - } - return d; + x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops; + return x86_vector_domain; } void __init x86_create_pci_msi_domain(void) @@ -213,41 +282,19 @@ void __init x86_create_pci_msi_domain(void) x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); } -#ifdef CONFIG_IRQ_REMAP -static struct irq_chip pci_msi_ir_controller = { - .name = "IR-PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_ack = irq_chip_ack_parent, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .flags = IRQCHIP_SKIP_SET_WAKE | - IRQCHIP_AFFINITY_PRE_STARTUP, -}; - -static struct msi_domain_info pci_msi_ir_domain_info = { - .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, - .ops = &pci_msi_domain_ops, - .chip = &pci_msi_ir_controller, - .handler = handle_edge_irq, - .handler_name = "edge", -}; - -struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, - const char *name, int id) +/* Keep around for hyperV */ +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) { - struct fwnode_handle *fn; - struct irq_domain *d; + init_irq_alloc_info(arg, NULL); - fn = irq_domain_alloc_named_id_fwnode(name, id); - if (!fn) - return NULL; - d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); - if (!d) - irq_domain_free_fwnode(fn); - return d; + if (to_pci_dev(dev)->msix_enabled) + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + else + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + return 0; } -#endif +EXPORT_SYMBOL_GPL(pci_msi_prepare); #ifdef CONFIG_DMAR_TABLE /* diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3e6f6b448f6a..c1efebd27e6c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -539,10 +539,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if (disable_apic) return -ENXIO; - /* Currently vector allocator can't guarantee contiguous allocations */ - if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) - return -ENOSYS; - /* * Catch any attempt to touch the cascade interrupt on a PIC * equipped system. 
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cb50589a7102..82c783da16a8 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -19,7 +19,6 @@ #include <asm/suspend.h> #include <asm/tlbflush.h> #include <asm/tdx.h> -#include "../kvm/vmx/vmx.h" #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -108,9 +107,9 @@ static void __used common(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); + OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); +#ifdef CONFIG_CALL_DEPTH_TRACKING + OFFSET(X86_call_depth, pcpu_hot, call_depth); +#endif - if (IS_ENABLED(CONFIG_KVM_INTEL)) { - BLANK(); - OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); - } } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 9b698215d261..bb65371ea9df 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -57,7 +57,7 @@ int main(void) BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); + OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); BLANK(); #endif return 0; diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c new file mode 100644 index 000000000000..7d2c75ec9a8c --- /dev/null +++ b/arch/x86/kernel/callthunks.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "callthunks: " fmt + +#include <linux/debugfs.h> +#include <linux/kallsyms.h> +#include <linux/memory.h> +#include <linux/moduleloader.h> +#include <linux/static_call.h> + +#include <asm/alternative.h> +#include <asm/asm-offsets.h> +#include <asm/cpu.h> +#include <asm/ftrace.h> +#include <asm/insn.h> +#include <asm/kexec.h> +#include <asm/nospec-branch.h> +#include <asm/paravirt.h> +#include <asm/sections.h> +#include <asm/switch_to.h> +#include <asm/sync_core.h> +#include <asm/text-patching.h> +#include <asm/xen/hypercall.h> + +static int __initdata_or_module debug_callthunks; + +#define prdbg(fmt, args...) 
\ +do { \ + if (debug_callthunks) \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ +} while(0) + +static int __init debug_thunks(char *str) +{ + debug_callthunks = 1; + return 1; +} +__setup("debug-callthunks", debug_thunks); + +#ifdef CONFIG_CALL_THUNKS_DEBUG +DEFINE_PER_CPU(u64, __x86_call_count); +DEFINE_PER_CPU(u64, __x86_ret_count); +DEFINE_PER_CPU(u64, __x86_stuffs_count); +DEFINE_PER_CPU(u64, __x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_call_count); +#endif + +extern s32 __call_sites[], __call_sites_end[]; + +struct thunk_desc { + void *template; + unsigned int template_size; +}; + +struct core_text { + unsigned long base; + unsigned long end; + const char *name; +}; + +static bool thunks_initialized __ro_after_init; + +static const struct core_text builtin_coretext = { + .base = (unsigned long)_text, + .end = (unsigned long)_etext, + .name = "builtin", +}; + +asm ( + ".pushsection .rodata \n" + ".global skl_call_thunk_template \n" + "skl_call_thunk_template: \n" + __stringify(INCREMENT_CALL_DEPTH)" \n" + ".global skl_call_thunk_tail \n" + "skl_call_thunk_tail: \n" + ".popsection \n" +); + +extern u8 skl_call_thunk_template[]; +extern u8 skl_call_thunk_tail[]; + +#define SKL_TMPL_SIZE \ + ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template)) + +extern void error_entry(void); +extern void xen_error_entry(void); +extern void paranoid_entry(void); + +static inline bool within_coretext(const struct core_text *ct, void *addr) +{ + unsigned long p = (unsigned long)addr; + + return ct->base <= p && p < ct->end; +} + +static inline bool within_module_coretext(void *addr) +{ + bool ret = false; + +#ifdef CONFIG_MODULES + struct module *mod; + + preempt_disable(); + mod = __module_address((unsigned long)addr); + if (mod && within_module_core((unsigned long)addr, mod)) + ret = true; + preempt_enable(); +#endif + return ret; +} + +static bool is_coretext(const struct core_text *ct, void *addr) +{ + if (ct && within_coretext(ct, addr)) + return true; + if (within_coretext(&builtin_coretext, addr)) + return true; + return within_module_coretext(addr); +} + +static __init_or_module bool skip_addr(void *dest) +{ + if (dest == error_entry) + return true; + if (dest == paranoid_entry) + return true; + if (dest == xen_error_entry) + return true; + /* Does FILL_RSB... */ + if (dest == __switch_to_asm) + return true; + /* Accounts directly */ + if (dest == ret_from_fork) + return true; +#ifdef CONFIG_HOTPLUG_CPU + if (dest == start_cpu0) + return true; +#endif +#ifdef CONFIG_FUNCTION_TRACER + if (dest == __fentry__) + return true; +#endif +#ifdef CONFIG_KEXEC_CORE + if (dest >= (void *)relocate_kernel && + dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) + return true; +#endif +#ifdef CONFIG_XEN + if (dest >= (void *)hypercall_page && + dest < (void*)hypercall_page + PAGE_SIZE) + return true; +#endif + return false; +} + +static __init_or_module void *call_get_dest(void *addr) +{ + struct insn insn; + void *dest; + int ret; + + ret = insn_decode_kernel(&insn, addr); + if (ret) + return ERR_PTR(ret); + + /* Patched out call? 
*/ + if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) + return NULL; + + dest = addr + insn.length + insn.immediate.value; + if (skip_addr(dest)) + return NULL; + return dest; +} + +static const u8 nops[] = { + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, +}; + +static __init_or_module void *patch_dest(void *dest, bool direct) +{ + unsigned int tsize = SKL_TMPL_SIZE; + u8 *pad = dest - tsize; + + /* Already patched? */ + if (!bcmp(pad, skl_call_thunk_template, tsize)) + return pad; + + /* Ensure there are nops */ + if (bcmp(pad, nops, tsize)) { + pr_warn_once("Invalid padding area for %pS\n", dest); + return NULL; + } + + if (direct) + memcpy(pad, skl_call_thunk_template, tsize); + else + text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true); + return pad; +} + +static __init_or_module void patch_call(void *addr, const struct core_text *ct) +{ + void *pad, *dest; + u8 bytes[8]; + + if (!within_coretext(ct, addr)) + return; + + dest = call_get_dest(addr); + if (!dest || WARN_ON_ONCE(IS_ERR(dest))) + return; + + if (!is_coretext(ct, dest)) + return; + + pad = patch_dest(dest, within_coretext(ct, dest)); + if (!pad) + return; + + prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr, + dest, dest, pad); + __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE); + text_poke_early(addr, bytes, CALL_INSN_SIZE); +} + +static __init_or_module void +patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) +{ + s32 *s; + + for (s = start; s < end; s++) + patch_call((void *)s + *s, ct); +} + +static __init_or_module void +patch_paravirt_call_sites(struct paravirt_patch_site *start, + struct paravirt_patch_site *end, + const struct core_text *ct) +{ + struct paravirt_patch_site *p; + + for (p = start; p < end; p++) + patch_call(p->instr, ct); +} + +static __init_or_module void +callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) +{ + prdbg("Patching call sites %s\n", ct->name); + patch_call_sites(cs->call_start, cs->call_end, ct); + patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct); + prdbg("Patching call sites done%s\n", ct->name); +} + +void __init callthunks_patch_builtin_calls(void) +{ + struct callthunk_sites cs = { + .call_start = __call_sites, + .call_end = __call_sites_end, + .pv_start = __parainstructions, + .pv_end = __parainstructions_end + }; + + if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return; + + pr_info("Setting up call depth tracking\n"); + mutex_lock(&text_mutex); + callthunks_setup(&cs, &builtin_coretext); + static_call_force_reinit(); + thunks_initialized = true; + mutex_unlock(&text_mutex); +} + +void *callthunks_translate_call_dest(void *dest) +{ + void *target; + + lockdep_assert_held(&text_mutex); + + if (!thunks_initialized || skip_addr(dest)) + return dest; + + if (!is_coretext(NULL, dest)) + return dest; + + target = patch_dest(dest, false); + return target ? 
: dest; +} + +bool is_callthunk(void *addr) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + unsigned long dest; + + dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); + if (!thunks_initialized || skip_addr((void *)dest)) + return false; + + return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); +} + +#ifdef CONFIG_BPF_JIT +int x86_call_depth_emit_accounting(u8 **pprog, void *func) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + + if (!thunks_initialized) + return 0; + + /* Is function call target a thunk? */ + if (func && is_callthunk(func)) + return 0; + + memcpy(*pprog, tmpl, tmpl_size); + *pprog += tmpl_size; + return tmpl_size; +} +#endif + +#ifdef CONFIG_MODULES +void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, + struct module *mod) +{ + struct core_text ct = { + .base = (unsigned long)mod->core_layout.base, + .end = (unsigned long)mod->core_layout.base + mod->core_layout.size, + .name = mod->name, + }; + + if (!thunks_initialized) + return; + + mutex_lock(&text_mutex); + callthunks_setup(cs, &ct); + mutex_unlock(&text_mutex); +} +#endif /* CONFIG_MODULES */ + +#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS) +static int callthunks_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + + seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,", + per_cpu(__x86_call_count, cpu), + per_cpu(__x86_ret_count, cpu), + per_cpu(__x86_stuffs_count, cpu), + per_cpu(__x86_ctxsw_count, cpu)); + return 0; +} + +static int callthunks_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, callthunks_debug_show, inode->i_private); +} + +static const struct file_operations dfs_ops = { + .open = callthunks_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init callthunks_debugfs_init(void) +{ + struct dentry *dir; + unsigned long cpu; + + dir = debugfs_create_dir("callthunks", NULL); + for_each_possible_cpu(cpu) { + void *arg = (void *)cpu; + char name [10]; + + sprintf(name, "cpu%lu", cpu); + debugfs_create_file(name, 0644, dir, arg, &dfs_ops); + } + return 0; +} +__initcall(callthunks_debugfs_init); +#endif diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c new file mode 100644 index 000000000000..8674a5c0c031 --- /dev/null +++ b/arch/x86/kernel/cfi.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2022 Google LLC + */ +#include <asm/cfi.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <linux/string.h> + +/* + * Returns the target address and the expected type when regs->ip points + * to a compiler-generated CFI trap. + */ +static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, + u32 *type) +{ + char buffer[MAX_INSN_SIZE]; + struct insn insn; + int offset = 0; + + *target = *type = 0; + + /* + * The compiler generates the following instruction sequence + * for indirect call checks: + * + * Â movl -<id>, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * We can decode the expected type and the target address from the + * movl/addl instructions. 
+ */ + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0xBA) + return false; + + *type = -(u32)insn.immediate.value; + + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0x3) + return false; + + /* Read the target address from the register. */ + offset = insn_get_modrm_rm_off(&insn, regs); + if (offset < 0) + return false; + + *target = *(unsigned long *)((void *)regs + offset); + + return true; +} + +/* + * Checks if a ud2 trap is because of a CFI failure, and handles the trap + * if needed. Returns a bug_trap_type value similarly to report_bug. + */ +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + unsigned long target; + u32 type; + + if (!is_cfi_trap(regs->ip)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, regs->ip); + + return report_cfi_failure(regs, regs->ip, &target, type); +} + +/* + * Ensure that __kcfi_typeid_ symbols are emitted for functions that may + * not be indirectly called with all configurations. + */ +__ADDRESSABLE(__memcpy) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9661e3e802be..d7e3ceaf75c1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,13 +12,11 @@ endif # If these files are instrumented, boot hangs during the first second. KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n -# Make sure load_percpu_segment has no stackprotector -CFLAGS_common.o := -fno-stack-protector - obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 23f5f27b5a02..485441b7f030 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -28,6 +28,9 @@ static void __init acrn_init_platform(void) { /* Setup the IDT for ACRN hypervisor callback */ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + + x86_platform.calibrate_tsc = acrn_get_tsc_khz; + x86_platform.calibrate_cpu = acrn_get_tsc_khz; } static bool acrn_x2apic_available(void) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 48276c0e479d..f769d6d08b43 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -503,7 +503,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; /* A random value per boot for bit slice [12:upper_bit) */ - va_align.bits = get_random_int() & va_align.mask; + va_align.bits = get_random_u32() & va_align.mask; } if (cpu_has(c, X86_FEATURE_MWAITX)) @@ -770,8 +770,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } -#define MSR_AMD64_DE_CFG 0xC0011029 - static void init_amd_ln(struct cpuinfo_x86 *c) { /* @@ -965,8 +963,8 @@ static void init_amd(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. 
*/ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); @@ -985,7 +983,7 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ - if (!cpu_has(c, X86_FEATURE_XENPV)) + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index da7c361f47e0..d970ddb0cc65 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -60,11 +60,18 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); static DEFINE_MUTEX(spec_ctrl_mutex); +/* Update SPEC_CTRL MSR and its cached copy unconditionally */ +static void update_spec_ctrl(u64 val) +{ + this_cpu_write(x86_spec_ctrl_current, val); + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + /* * Keep track of the SPEC_CTRL MSR value for the current task, which may differ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -void write_spec_ctrl_current(u64 val, bool force) +void update_spec_ctrl_cond(u64 val) { if (this_cpu_read(x86_spec_ctrl_current) == val) return; @@ -75,7 +82,7 @@ void write_spec_ctrl_current(u64 val, bool force) * When KERNEL_IBRS this MSR is written on return-to-user, unless * forced the update can be delayed until that time. */ - if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) wrmsrl(MSR_IA32_SPEC_CTRL, val); } @@ -196,22 +203,15 @@ void __init check_bugs(void) } /* - * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is - * done in vmenter.S. + * NOTE: This function is *only* called for SVM, since Intel uses + * MSR_IA32_SPEC_CTRL for SSBD. */ void -x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) { - u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); + u64 guestval, hostval; struct thread_info *ti = current_thread_info(); - if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - if (hostval != guestval) { - msrval = setguest ? guestval : hostval; - wrmsrl(MSR_IA32_SPEC_CTRL, msrval); - } - } - /* * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. 
@@ -787,6 +787,7 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, RETBLEED_MITIGATION_EIBRS, + RETBLEED_MITIGATION_STUFF, }; enum retbleed_mitigation_cmd { @@ -794,6 +795,7 @@ enum retbleed_mitigation_cmd { RETBLEED_CMD_AUTO, RETBLEED_CMD_UNRET, RETBLEED_CMD_IBPB, + RETBLEED_CMD_STUFF, }; static const char * const retbleed_strings[] = { @@ -802,6 +804,7 @@ static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", + [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing", }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = @@ -831,8 +834,12 @@ static int __init retbleed_parse_cmdline(char *str) retbleed_cmd = RETBLEED_CMD_UNRET; } else if (!strcmp(str, "ibpb")) { retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "stuff")) { + retbleed_cmd = RETBLEED_CMD_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_RETBLEED); } else { pr_err("Ignoring unknown retbleed option (%s).", str); } @@ -879,6 +886,21 @@ static void __init retbleed_select_mitigation(void) } break; + case RETBLEED_CMD_STUFF: + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + + } else { + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + else + pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + + goto do_cmd_auto; + } + break; + do_cmd_auto: case RETBLEED_CMD_AUTO: default: @@ -916,6 +938,12 @@ do_cmd_auto: mitigate_smt = true; break; + case RETBLEED_MITIGATION_STUFF: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + x86_set_skl_return_thunk(); + break; + default: break; } @@ -926,7 +954,7 @@ do_cmd_auto: /* * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option. + * retbleed= cmdline option except for call depth based stuffing */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { switch (spectre_v2_enabled) { @@ -939,7 +967,8 @@ do_cmd_auto: retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - pr_err(RETBLEED_INTEL_MSG); + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); } } @@ -1302,7 +1331,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { pr_err("%s selected but running as XenPV guest. 
Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1335,7 +1364,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) if (ia32_cap & ARCH_CAP_RRSBA) { x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } } @@ -1413,6 +1442,7 @@ static void __init spectre_v2_select_mitigation(void) if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && + retbleed_cmd != RETBLEED_CMD_STUFF && boot_cpu_has(X86_FEATURE_IBRS) && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { mode = SPECTRE_V2_IBRS; @@ -1457,7 +1487,7 @@ static void __init spectre_v2_select_mitigation(void) if (spectre_v2_in_ibrs_mode(mode)) { x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } switch (mode) { @@ -1571,7 +1601,7 @@ static void __init spectre_v2_select_mitigation(void) static void update_stibp_msr(void * __unused) { u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); - write_spec_ctrl_current(val, true); + update_spec_ctrl(val); } /* Update x86_spec_ctrl_base in case SMT state changed. */ @@ -1804,7 +1834,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } } @@ -2055,7 +2085,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); @@ -2206,74 +2236,74 @@ static const char * const l1tf_vmx_states[] = { static ssize_t l1tf_show_state(char *buf) { if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) - return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && sched_smt_active())) { - return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, - l1tf_vmx_states[l1tf_vmx_mitigation]); + return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); } - return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, - l1tf_vmx_states[l1tf_vmx_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], + sched_smt_active() ? 
"vulnerable" : "disabled"); } static ssize_t itlb_multihit_show_state(char *buf) { if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !boot_cpu_has(X86_FEATURE_VMX)) - return sprintf(buf, "KVM: Mitigation: VMX unsupported\n"); + return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n"); else if (!(cr4_read_shadow() & X86_CR4_VMXE)) - return sprintf(buf, "KVM: Mitigation: VMX disabled\n"); + return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n"); else if (itlb_multihit_kvm_mitigation) - return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n"); else - return sprintf(buf, "KVM: Vulnerable\n"); + return sysfs_emit(buf, "KVM: Vulnerable\n"); } #else static ssize_t l1tf_show_state(char *buf) { - return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); } static ssize_t itlb_multihit_show_state(char *buf) { - return sprintf(buf, "Processor vulnerable\n"); + return sysfs_emit(buf, "Processor vulnerable\n"); } #endif static ssize_t mds_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - return sprintf(buf, "%s; SMT Host state unknown\n", - mds_strings[mds_mitigation]); + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mds_strings[mds_mitigation]); } if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { - return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], - (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : - sched_smt_active() ? "mitigated" : "disabled")); + return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : + sched_smt_active() ? "mitigated" : "disabled")); } - return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t tsx_async_abort_show_state(char *buf) { if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || (taa_mitigation == TAA_MITIGATION_OFF)) - return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]); if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - return sprintf(buf, "%s; SMT Host state unknown\n", - taa_strings[taa_mitigation]); + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); } - return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? 
"vulnerable" : "disabled"); } static ssize_t mmio_stale_data_show_state(char *buf) @@ -2341,73 +2371,72 @@ static char *pbrsb_eibrs_state(void) static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sprintf(buf, "Vulnerable: LFENCE\n"); + return sysfs_emit(buf, "Vulnerable: LFENCE\n"); if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) - return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); if (sched_smt_active() && unprivileged_ebpf_enabled() && spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) - return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sprintf(buf, "%s%s%s%s%s%s%s\n", - spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - pbrsb_eibrs_state(), - spectre_v2_module_string()); + return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + pbrsb_eibrs_state(), + spectre_v2_module_string()); } static ssize_t srbds_show_state(char *buf) { - return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); + return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]); } static ssize_t retbleed_show_state(char *buf) { if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) - return sprintf(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); - return sprintf(buf, "%s; SMT %s\n", - retbleed_strings[retbleed_mitigation], - !sched_smt_active() ? "disabled" : - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? - "enabled with STIBP protection" : "vulnerable"); + return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation], + !sched_smt_active() ? "disabled" : + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? 
+ "enabled with STIBP protection" : "vulnerable"); } - return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); + return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); } static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { if (!boot_cpu_has_bug(bug)) - return sprintf(buf, "Not affected\n"); + return sysfs_emit(buf, "Not affected\n"); switch (bug) { case X86_BUG_CPU_MELTDOWN: if (boot_cpu_has(X86_FEATURE_PTI)) - return sprintf(buf, "Mitigation: PTI\n"); + return sysfs_emit(buf, "Mitigation: PTI\n"); if (hypervisor_is_type(X86_HYPER_XEN_PV)) - return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); + return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); break; case X86_BUG_SPECTRE_V1: - return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); + return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: - return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]); case X86_BUG_L1TF: if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) @@ -2437,7 +2466,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr break; } - return sprintf(buf, "Vulnerable\n"); + return sysfs_emit(buf, "Vulnerable\n"); } ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 66556833d7af..f4e5aa27eec6 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -11,15 +11,19 @@ #include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/cpu.h> +#include <linux/cpuhotplug.h> #include <linux/sched.h> #include <linux/capability.h> #include <linux/sysfs.h> #include <linux/pci.h> +#include <linux/stop_machine.h> #include <asm/cpufeature.h> #include <asm/cacheinfo.h> #include <asm/amd_nb.h> #include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/tlbflush.h> #include "cpu.h" @@ -35,6 +39,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Shared L2 cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); +/* Kernel controls MTRR and/or PAT MSRs. */ +unsigned int memory_caching_control __ro_after_init; + struct _cache_table { unsigned char descriptor; char cache_type; @@ -1040,3 +1047,175 @@ int populate_cache_leaves(unsigned int cpu) return 0; } + +/* + * Disable and enable caches. Needed for changing MTRRs and the PAT MSR. + * + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after cache_enable() has been called. + */ +static unsigned long saved_cr4; +static DEFINE_RAW_SPINLOCK(cache_disable_lock); + +void cache_disable(void) __acquires(cache_disable_lock) +{ + unsigned long cr0; + + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ + + raw_spin_lock(&cache_disable_lock); + + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. 
*/ + cr0 = read_cr0() | X86_CR0_CD; + write_cr0(cr0); + + /* + * Cache flushing is the most time-consuming step when programming + * the MTRRs. Fortunately, as per the Intel Software Development + * Manual, we can skip it if the processor supports cache self- + * snooping. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_feature_enabled(X86_FEATURE_PGE)) { + saved_cr4 = __read_cr4(); + __write_cr4(saved_cr4 & ~X86_CR4_PGE); + } + + /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + flush_tlb_local(); + + if (cpu_feature_enabled(X86_FEATURE_MTRR)) + mtrr_disable(); + + /* Again, only flush caches if we have to. */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + +void cache_enable(void) __releases(cache_disable_lock) +{ + /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + flush_tlb_local(); + + if (cpu_feature_enabled(X86_FEATURE_MTRR)) + mtrr_enable(); + + /* Enable caches */ + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Restore value of CR4 */ + if (cpu_feature_enabled(X86_FEATURE_PGE)) + __write_cr4(saved_cr4); + + raw_spin_unlock(&cache_disable_lock); +} + +static void cache_cpu_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + cache_disable(); + + if (memory_caching_control & CACHE_MTRR) + mtrr_generic_set_state(); + + if (memory_caching_control & CACHE_PAT) + pat_cpu_init(); + + cache_enable(); + local_irq_restore(flags); +} + +static bool cache_aps_delayed_init = true; + +void set_cache_aps_delayed_init(bool val) +{ + cache_aps_delayed_init = val; +} + +bool get_cache_aps_delayed_init(void) +{ + return cache_aps_delayed_init; +} + +static int cache_rendezvous_handler(void *unused) +{ + if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id())) + cache_cpu_init(); + + return 0; +} + +void __init cache_bp_init(void) +{ + mtrr_bp_init(); + pat_bp_init(); + + if (memory_caching_control) + cache_cpu_init(); +} + +void cache_bp_restore(void) +{ + if (memory_caching_control) + cache_cpu_init(); +} + +static int cache_ap_init(unsigned int cpu) +{ + if (!memory_caching_control || get_cache_aps_delayed_init()) + return 0; + + /* + * Ideally we should hold mtrr_mutex here to avoid MTRR entries + * changed, but this routine will be called in CPU boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very early time of software resume, when there absolutely + * isn't MTRR entry changes; + * + * 2. CPU hotadd time. 
We let mtrr_add/del_page hold cpuhotplug + * lock to prevent MTRR entry changes + */ + stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL, + cpu_callout_mask); + + return 0; +} + +/* + * Delayed cache initialization for all AP's + */ +void cache_aps_init(void) +{ + if (!memory_caching_control || !get_cache_aps_delayed_init()) + return; + + stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask); + set_cache_aps_delayed_init(false); +} + +static int __init cache_ap_register(void) +{ + cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING, + "x86/cachectrl:starting", + cache_ap_init, NULL); + return 0; +} +core_initcall(cache_ap_register); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3e508f239098..9cfca3d7d0e2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -22,9 +22,9 @@ #include <linux/io.h> #include <linux/syscore_ops.h> #include <linux/pgtable.h> +#include <linux/stackprotector.h> #include <asm/cmdline.h> -#include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@ -52,6 +52,7 @@ #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/cacheinfo.h> #include <asm/memtype.h> #include <asm/microcode.h> #include <asm/microcode_intel.h> @@ -609,6 +610,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) if (!ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); + wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); return; } @@ -701,16 +703,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); -void load_percpu_segment(int cpu) -{ -#ifdef CONFIG_X86_32 - loadsegment(fs, __KERNEL_PERCPU); -#else - __loadsegment_simple(gs, 0); - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#endif -} - #ifdef CONFIG_X86_32 /* The 32-bit entry code needs to find cpu_entry_area. */ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); @@ -738,16 +730,45 @@ void load_fixmap_gdt(int cpu) } EXPORT_SYMBOL_GPL(load_fixmap_gdt); -/* - * Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. +/** + * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base + * @cpu: The CPU number for which this is invoked + * + * Invoked during early boot to switch from early GDT and early per CPU to + * the direct GDT and the runtime per CPU area. On 32-bit the percpu base + * switch is implicit by loading the direct GDT. On 64bit this requires + * to update GSBASE. */ -void switch_to_new_gdt(int cpu) +void __init switch_gdt_and_percpu_base(int cpu) { - /* Load the original GDT */ load_direct_gdt(cpu); - /* Reload the per-cpu base */ - load_percpu_segment(cpu); + +#ifdef CONFIG_X86_64 + /* + * No need to load %gs. It is already correct. + * + * Writing %gs on 64bit would zero GSBASE which would make any per + * CPU operation up to the point of the wrmsrl() fault. + * + * Set GSBASE to the new offset. Until the wrmsrl() happens the + * early mapping is still valid. That means the GSBASE update will + * lose any prior per CPU data which was not copied over in + * setup_per_cpu_areas(). + * + * This works even with stackprotector enabled because the + * per CPU stack canary is 0 in both per CPU areas. 
+ */ + wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); +#else + /* + * %fs is already set to __KERNEL_PERCPU, but after switching GDT + * it is required to load FS again so that the 'hidden' part is + * updated from the new GDT. Up to this point the early per CPU + * translation is active. Any content of the early per CPU data + * which was not copied over in setup_per_cpu_areas() is lost. + */ + loadsegment(fs, __KERNEL_PERCPU); +#endif } static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; @@ -1948,7 +1969,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_32 enable_sep_cpu(); #endif - mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); @@ -1993,27 +2013,18 @@ static __init int setup_clearcpuid(char *arg) } __setup("clearcpuid=", setup_clearcpuid); +DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { + .current_task = &init_task, + .preempt_count = INIT_PREEMPT_COUNT, + .top_of_stack = TOP_OF_INIT_STACK, +}; +EXPORT_PER_CPU_SYMBOL(pcpu_hot); + #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); -/* - * The following percpu variables are hot. Align current_task to - * cacheline size such that they fall in the same cacheline. - */ -DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = - &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - -DEFINE_PER_CPU(void *, hardirq_stack_ptr); -DEFINE_PER_CPU(bool, hardirq_stack_inuse); - -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; - static void wrmsrl_cstar(unsigned long val) { /* @@ -2064,20 +2075,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - -/* - * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find - * the top of the kernel stack. Use an extra percpu variable to track the - * top of the kernel stack directly. 
- */ -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); - #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); @@ -2248,12 +2245,6 @@ void cpu_init(void) boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - switch_to_new_gdt(cpu); - if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index c881bcafba7d..d95221117129 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,6 +75,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 993697e71854..03851240c3e3 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/tboot.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/msr-index.h> #include <asm/processor.h> #include <asm/vmx.h> -#include "cpu.h" #undef pr_fmt #define pr_fmt(fmt) "x86/cpu: " fmt diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 21fd425088fe..5a2962c492d3 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -326,8 +326,8 @@ static void init_hygon(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); @@ -339,7 +339,7 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARAT); /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */ - if (!cpu_has(c, X86_FEATURE_XENPV)) + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); check_null_seg_clears_base(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 2d7ea5480ec3..291d4167fab8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -210,12 +210,154 @@ int intel_cpu_collect_info(struct ucode_cpu_info *uci) csig.rev = intel_get_microcode_revision(); uci->cpu_sig = csig; - uci->valid = 1; return 0; } EXPORT_SYMBOL_GPL(intel_cpu_collect_info); +/* + * Returns 1 if update has been found, 0 otherwise. + */ +int intel_find_matching_signature(void *mc, unsigned int csig, int cpf) +{ + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; + int i; + + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; + + /* Look for ext. 
headers: */ + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; + + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; + + for (i = 0; i < ext_hdr->count; i++) { + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_find_matching_signature); + +/** + * intel_microcode_sanity_check() - Sanity check microcode file. + * @mc: Pointer to the microcode file contents. + * @print_err: Display failure reason if true, silent if false. + * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file. + * Validate if the microcode header type matches with the type + * specified here. + * + * Validate certain header fields and verify if computed checksum matches + * with the one specified in the header. + * + * Return: 0 if the file passes all the checks, -EINVAL if any of the checks + * fail. + */ +int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format. Header type %d\n", + mc_header->hdrver); + return -EINVAL; + } + + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + + if (ext_table_size < EXT_HEADER_SIZE || + ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; + + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; + } + + if (!ext_table_size) + return 0; + + /* + * Check extended signature checksum: 0 => valid. 
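/*
 * Editorial sketch, not part of this patch: intel_microcode_sanity_check()
 * relies on the convention that a valid update image sums to zero when the
 * header plus data are added up as 32-bit words. A minimal standalone
 * checker (the sketch_* name is invented here):
 */
#include <stdint.h>
#include <stddef.h>

static int sketch_mc_checksum_ok(const void *mc, size_t bytes)
{
	const uint32_t *words = mc;
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < bytes / sizeof(uint32_t); i++)
		sum += words[i];

	return sum == 0;	/* a non-zero sum means the image is corrupted */
}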
+ */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_microcode_sanity_check); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -1034,8 +1176,32 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static unsigned int sysctl_sld_mitigate = 1; static DEFINE_SEMAPHORE(buslock_sem); +#ifdef CONFIG_PROC_SYSCTL +static struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1146,12 +1312,20 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } -static void __split_lock_reenable(struct work_struct *work) +static void __split_lock_reenable_unlock(struct work_struct *work) { sld_update_msr(true); up(&buslock_sem); } +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + /* * If a CPU goes offline with pending delayed work to re-enable split lock * detection then the delayed work will be executed on some other CPU. That @@ -1169,10 +1343,9 @@ static int splitlock_cpu_offline(unsigned int cpu) return 0; } -static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); - static void split_lock_warn(unsigned long ip) { + struct delayed_work *work; int cpu; if (!current->reported_split_lock) @@ -1180,14 +1353,26 @@ static void split_lock_warn(unsigned long ip) current->comm, current->pid, ip); current->reported_split_lock = 1; - /* misery factor #1, sleep 10ms before trying to execute split lock */ - if (msleep_interruptible(10) > 0) - return; - /* Misery factor #2, only allow one buslocked disabled core at a time */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. 
+ */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + cpu = get_cpu(); - schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + schedule_delayed_work_on(cpu, work, 2); /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index fbaf12e43f41..3b8476158236 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -204,7 +204,12 @@ static int intel_epb_offline(unsigned int cpu) } static const struct x86_cpu_id intel_epb_normal[] = { - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), {} }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1c87501e0fa3..10fb5b5c9efa 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -788,6 +788,24 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) return status & MCI_STATUS_DEFERRED; } +static bool _log_error_deferred(unsigned int bank, u32 misc) +{ + if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), misc)) + return false; + + /* + * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. + * Return true here to avoid accessing these registers. + */ + if (!mce_flags.smca) + return true; + + /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return true; +} + /* * We have three scenarios for checking for Deferred errors: * @@ -799,19 +817,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) */ static void log_error_deferred(unsigned int bank) { - bool defrd; - - defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), - mca_msr_reg(bank, MCA_ADDR), 0); - - if (!mce_flags.smca) - return; - - /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ - if (defrd) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + if (_log_error_deferred(bank, 0)) return; - } /* * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check @@ -832,7 +839,7 @@ static void amd_deferred_error_interrupt(void) static void log_error_thresholding(unsigned int bank, u64 misc) { - _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc); + _log_error_deferred(bank, misc); } static void log_and_reset_block(struct threshold_block *block) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 717192915f28..8ed341714686 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -29,15 +29,26 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; + int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; + /* + * Even if the ->validation_bits are set for address mask, + * to be extra safe, check and reject an error radius '0', + * and fall back to the default page size. 
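/*
 * Editorial sketch, not part of this patch: the address-mask handling added
 * to apei_mce_report_mem_error() below reduces the CPER physical address
 * mask to an error granularity, capped at one page when the mask is zero or
 * coarser than a page. Equivalent standalone logic (SKETCH_* and sketch_*
 * names are invented here):
 */
#include <stdint.h>

#define SKETCH_PAGE_SHIFT 12

static int sketch_error_grain_shift(uint64_t physical_addr_mask)
{
	/* No usable bit below the page size: fall back to page granularity. */
	if (!(physical_addr_mask & ((1ULL << SKETCH_PAGE_SHIFT) - 1)))
		return SKETCH_PAGE_SHIFT;

	return __builtin_ctzll(physical_addr_mask);	/* lowest set bit */
}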
+ */ + if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) + lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT); + else + lsb = PAGE_SHIFT; + mce_setup(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT; + m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 00483d1c27e4..c4477162c07d 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -203,6 +203,11 @@ static struct severity { BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) ), MCESEV( + PANIC, "Uncorrected in kernel", + BITSET(MCI_STATUS_UC), + KERNEL + ), + MCESEV( UC, "Uncorrected", BITSET(MCI_STATUS_UC) ), @@ -391,9 +396,6 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char *msg = s->msg; s->covered = 1; - if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; - return s->sev; } } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 8b2fcdfa6d31..56471f750762 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -440,7 +440,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p return ret; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev >= mc->hdr.patch_id) + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) return ret; if (!__apply_microcode_amd(mc)) { @@ -528,8 +534,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* Check whether we have saved a new patch already: */ - if (*new_rev && rev < mc->hdr.patch_id) { + /* + * Check whether a new patch has been saved already. Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. + */ + if (*new_rev && rev <= mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; return; @@ -788,6 +798,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, kfree(patch); return -EINVAL; } + patch->size = *patch_size; mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); proc_id = mc_hdr->processor_rev_id; @@ -869,7 +880,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) return ret; memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); return ret; } @@ -890,8 +901,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) * * These might be larger than 2K. 
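/*
 * Editorial sketch, not part of this patch: the relaxed comparisons in the
 * AMD early-loading hunks above ('>' instead of '>=', '<=' instead of '<')
 * mean a patch is skipped only when the CPU already runs a strictly newer
 * revision, so re-applying the same revision on an SMT sibling is allowed.
 * Standalone equivalent (the sketch_* name is invented here):
 */
#include <stdbool.h>
#include <stdint.h>

static bool sketch_should_apply_amd_patch(uint32_t current_rev, uint32_t patch_rev)
{
	/* Equal revisions are applied again on purpose (SMT-specific changes). */
	return current_rev <= patch_rev;
}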
*/ -static enum ucode_state request_microcode_amd(int cpu, struct device *device, - bool refresh_fw) +static enum ucode_state request_microcode_amd(int cpu, struct device *device) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -900,7 +910,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, const struct firmware *fw; /* reload ucode container only on the boot cpu */ - if (!refresh_fw || !bsp) + if (!bsp) return UCODE_OK; if (c->x86 >= 0x15) @@ -924,12 +934,6 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, return ret; } -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - return UCODE_ERROR; -} - static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -938,7 +942,6 @@ static void microcode_fini_cpu_amd(int cpu) } static struct microcode_ops microcode_amd_ops = { - .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index ad57e0e4d674..712aafff96e0 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -319,60 +319,6 @@ void reload_early_microcode(void) } } -static void collect_cpu_info_local(void *arg) -{ - struct cpu_info_ctx *ctx = arg; - - ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), - ctx->cpu_sig); -} - -static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) -{ - struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; - int ret; - - ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); - if (!ret) - ret = ctx.err; - - return ret; -} - -static int collect_cpu_info(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - int ret; - - memset(uci, 0, sizeof(*uci)); - - ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); - if (!ret) - uci->valid = 1; - - return ret; -} - -static void apply_microcode_local(void *arg) -{ - enum ucode_state *err = arg; - - *err = microcode_ops->apply_microcode(smp_processor_id()); -} - -static int apply_microcode_on_target(int cpu) -{ - enum ucode_state err; - int ret; - - ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1); - if (!ret) { - if (err == UCODE_ERROR) - ret = 1; - } - return ret; -} - /* fake device for request_firmware */ static struct platform_device *microcode_pdev; @@ -458,7 +404,7 @@ static int __reload_late(void *info) * below. */ if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) - apply_microcode_local(&err); + err = microcode_ops->apply_microcode(cpu); else goto wait_for_siblings; @@ -480,7 +426,7 @@ wait_for_siblings: * revision. 
*/ if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) - apply_microcode_local(&err); + err = microcode_ops->apply_microcode(cpu); return ret; } @@ -491,7 +437,7 @@ wait_for_siblings: */ static int microcode_reload_late(void) { - int ret; + int old = boot_cpu_data.microcode, ret; pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); pr_err("You should switch to early loading, if possible.\n"); @@ -503,7 +449,8 @@ static int microcode_reload_late(void) if (ret == 0) microcode_check(); - pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); + pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n", + old, boot_cpu_data.microcode); return ret; } @@ -530,7 +477,7 @@ static ssize_t reload_store(struct device *dev, if (ret) goto put; - tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); + tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev); if (tmp_ret != UCODE_NEW) goto put; @@ -588,91 +535,17 @@ static void microcode_fini_cpu(int cpu) microcode_ops->microcode_fini_cpu(cpu); } -static enum ucode_state microcode_resume_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu) { - if (apply_microcode_on_target(cpu)) - return UCODE_ERROR; - - pr_debug("CPU%d updated upon resume\n", cpu); - - return UCODE_OK; -} - -static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) -{ - enum ucode_state ustate; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci->valid) - return UCODE_OK; - - if (collect_cpu_info(cpu)) - return UCODE_ERROR; - - /* --dimm. Trigger a delayed update? */ - if (system_state != SYSTEM_RUNNING) - return UCODE_NFOUND; - - ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, refresh_fw); - if (ustate == UCODE_NEW) { - pr_debug("CPU%d updated upon init\n", cpu); - apply_microcode_on_target(cpu); - } - - return ustate; -} - -static enum ucode_state microcode_update_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - /* Refresh CPU microcode revision after resume. */ - collect_cpu_info(cpu); - - if (uci->valid) - return microcode_resume_cpu(cpu); - - return microcode_init_cpu(cpu, false); -} - -static int mc_device_add(struct device *dev, struct subsys_interface *sif) -{ - int err, cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("CPU%d added\n", cpu); - - err = sysfs_create_group(&dev->kobj, &mc_attr_group); - if (err) - return err; + memset(uci, 0, sizeof(*uci)); - if (microcode_init_cpu(cpu, true) == UCODE_ERROR) - return -EINVAL; + microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); - return err; + return microcode_ops->apply_microcode(cpu); } -static void mc_device_remove(struct device *dev, struct subsys_interface *sif) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return; - - pr_debug("CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&dev->kobj, &mc_attr_group); -} - -static struct subsys_interface mc_cpu_interface = { - .name = "microcode", - .subsys = &cpu_subsys, - .add_dev = mc_device_add, - .remove_dev = mc_device_remove, -}; - /** * microcode_bsp_resume - Update boot CPU microcode during resume. 
*/ @@ -681,21 +554,23 @@ void microcode_bsp_resume(void) int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci->valid && uci->mc) + if (uci->mc) microcode_ops->apply_microcode(cpu); - else if (!uci->mc) + else reload_early_microcode(); } static struct syscore_ops mc_syscore_ops = { - .resume = microcode_bsp_resume, + .resume = microcode_bsp_resume, }; static int mc_cpu_starting(unsigned int cpu) { - microcode_update_cpu(cpu); - pr_debug("CPU%d added\n", cpu); - return 0; + enum ucode_state err = microcode_ops->apply_microcode(cpu); + + pr_debug("%s: CPU%d, err: %d\n", __func__, cpu, err); + + return err == UCODE_ERROR; } static int mc_cpu_online(unsigned int cpu) @@ -712,13 +587,30 @@ static int mc_cpu_down_prep(unsigned int cpu) struct device *dev; dev = get_cpu_device(cpu); + + microcode_fini_cpu(cpu); + /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); - pr_debug("CPU%d removed\n", cpu); + pr_debug("%s: CPU%d\n", __func__, cpu); return 0; } +static void setup_online_cpu(struct work_struct *work) +{ + int cpu = smp_processor_id(); + enum ucode_state err; + + err = microcode_init_cpu(cpu); + if (err == UCODE_ERROR) { + pr_err("Error applying microcode on CPU%d\n", cpu); + return; + } + + mc_cpu_online(cpu); +} + static struct attribute *cpu_root_microcode_attrs[] = { #ifdef CONFIG_MICROCODE_LATE_LOADING &dev_attr_reload.attr, @@ -749,28 +641,19 @@ static int __init microcode_init(void) if (!microcode_ops) return -ENODEV; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); + microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) return PTR_ERR(microcode_pdev); - cpus_read_lock(); - mutex_lock(µcode_mutex); - error = subsys_interface_register(&mc_cpu_interface); - mutex_unlock(µcode_mutex); - cpus_read_unlock(); - - if (error) - goto out_pdev; - - error = sysfs_create_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpu_root_microcode_group); if (error) { pr_err("Error creating microcode group!\n"); - goto out_driver; + goto out_pdev; } + /* Do per-CPU setup */ + schedule_on_each_cpu(setup_online_cpu); + register_syscore_ops(&mc_syscore_ops); cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", mc_cpu_starting, NULL); @@ -781,15 +664,6 @@ static int __init microcode_init(void) return 0; - out_driver: - cpus_read_lock(); - mutex_lock(µcode_mutex); - - subsys_interface_unregister(&mc_cpu_interface); - - mutex_unlock(µcode_mutex); - cpus_read_unlock(); - out_pdev: platform_device_unregister(microcode_pdev); return error; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 025c8f0cd948..cac2bdb57f0b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -48,34 +48,6 @@ static int llc_size_per_core; /* * Returns 1 if update has been found, 0 otherwise. */ -static int find_matching_signature(void *mc, unsigned int csig, int cpf) -{ - struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; - struct extended_signature *ext_sig; - int i; - - if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; - - /* Look for ext. 
headers: */ - if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; - - ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; - ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - - for (i = 0; i < ext_hdr->count; i++) { - if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { struct microcode_header_intel *mc_hdr = mc; @@ -83,7 +55,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev if (mc_hdr->rev <= new_rev) return 0; - return find_matching_signature(mc, csig, cpf); + return intel_find_matching_signature(mc, csig, cpf); } static struct ucode_patch *memdup_patch(void *data, unsigned int size) @@ -117,7 +89,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne sig = mc_saved_hdr->sig; pf = mc_saved_hdr->pf; - if (find_matching_signature(data, sig, pf)) { + if (intel_find_matching_signature(data, sig, pf)) { prev_found = true; if (mc_hdr->rev <= mc_saved_hdr->rev) @@ -149,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne if (!p) return; - if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) + if (!intel_find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) return; /* @@ -163,104 +135,6 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne intel_ucode_patch = p->data; } -static int microcode_sanity_check(void *mc, int print_err) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - u32 sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("Error: bad microcode data file size.\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - if (print_err) - pr_err("Error: invalid/unknown microcode update format.\n"); - return -EINVAL; - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep; - - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("Error: truncated extended signature table.\n"); - return -EINVAL; - } - - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("Error: extended signature table size mismatch.\n"); - return -EFAULT; - } - - ext_sigcount = ext_header->count; - - /* - * Check extended table checksum: the sum of all dwords that - * comprise a valid table must be 0. - */ - ext_tablep = (u32 *)ext_header; - - i = ext_table_size / sizeof(u32); - while (i--) - ext_table_sum += ext_tablep[i]; - - if (ext_table_sum) { - if (print_err) - pr_warn("Bad extended signature table checksum, aborting.\n"); - return -EINVAL; - } - } - - /* - * Calculate the checksum of update data and header. The checksum of - * valid update data and header including the extended signature table - * must be 0. 
- */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / sizeof(u32); - while (i--) - orig_sum += ((u32 *)mc)[i]; - - if (orig_sum) { - if (print_err) - pr_err("Bad microcode data checksum, aborting.\n"); - return -EINVAL; - } - - if (!ext_table_size) - return 0; - - /* - * Check extended signature checksum: 0 => valid. - */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - - sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - - (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("Bad extended signature checksum, aborting.\n"); - return -EINVAL; - } - } - return 0; -} - /* * Get microcode matching with BSP's model. Only CPUs with the same model as * BSP can stay in the platform. @@ -281,13 +155,13 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) mc_size = get_totalsize(mc_header); if (!mc_size || mc_size > size || - microcode_sanity_check(data, 0) < 0) + intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0) break; size -= mc_size; - if (!find_matching_signature(data, uci->cpu_sig.sig, - uci->cpu_sig.pf)) { + if (!intel_find_matching_signature(data, uci->cpu_sig.sig, + uci->cpu_sig.pf)) { data += mc_size; continue; } @@ -621,7 +495,6 @@ void load_ucode_intel_ap(void) else iup = &intel_ucode_patch; -reget: if (!*iup) { patch = __load_ucode_intel(&uci); if (!patch) @@ -632,12 +505,7 @@ reget: uci.mc = *iup; - if (apply_microcode_early(&uci, true)) { - /* Mixed-silicon system? Try to refetch the proper patch: */ - *iup = NULL; - - goto reget; - } + apply_microcode_early(&uci, true); } static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) @@ -652,9 +520,9 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) if (phdr->rev <= uci->cpu_sig.rev) continue; - if (!find_matching_signature(phdr, - uci->cpu_sig.sig, - uci->cpu_sig.pf)) + if (!intel_find_matching_signature(phdr, + uci->cpu_sig.sig, + uci->cpu_sig.pf)) continue; return iter->data; @@ -680,7 +548,6 @@ void reload_ucode_intel(void) static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { - static struct cpu_signature prev; struct cpuinfo_x86 *c = &cpu_data(cpu_num); unsigned int val[2]; @@ -696,13 +563,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->rev = c->microcode; - /* No extra locking on prev, races are harmless. 
*/ - if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) { - pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n", - csig->sig, csig->pf, csig->rev); - prev = *csig; - } - return 0; } @@ -820,7 +680,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) memcpy(mc, &mc_header, sizeof(mc_header)); data = mc + sizeof(mc_header); if (!copy_from_iter_full(data, data_size, iter) || - microcode_sanity_check(mc, 1) < 0) { + intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) { break; } @@ -885,8 +745,7 @@ static bool is_blacklisted(unsigned int cpu) return false; } -static enum ucode_state request_microcode_fw(int cpu, struct device *device, - bool refresh_fw) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; @@ -908,7 +767,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, kvec.iov_base = (void *)firmware->data; kvec.iov_len = firmware->size; - iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size); + iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size); ret = generic_load_microcode(cpu, &iter); release_firmware(firmware); @@ -916,24 +775,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return ret; } -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - struct iov_iter iter; - struct iovec iov; - - if (is_blacklisted(cpu)) - return UCODE_NFOUND; - - iov.iov_base = (void __user *)buf; - iov.iov_len = size; - iov_iter_init(&iter, WRITE, &iov, 1, size); - - return generic_load_microcode(cpu, &iter); -} - static struct microcode_ops microcode_intel_ops = { - .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_intel, diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 831613959a92..46668e255421 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -475,6 +475,12 @@ static bool __init ms_hyperv_x2apic_available(void) * (logically) generates MSIs directly to the system APIC irq domain. * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the * pci-hyperv host bridge. + * + * Note: for a Hyper-V root partition, this will always return false. + * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by + * default, they are implemented as intercepts by the Windows Hyper-V stack. + * Even a nested root partition (L2 root) will not get them because the + * nested (L1) hypervisor filters them out. 
*/ static bool __init ms_hyperv_msi_ext_dest_id(void) { diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index a65a0272096d..eff6ac62c0ff 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -109,7 +109,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) return 0; } -static const struct mtrr_ops amd_mtrr_ops = { +const struct mtrr_ops amd_mtrr_ops = { .vendor = X86_VENDOR_AMD, .set = amd_set_mtrr, .get = amd_get_mtrr, @@ -117,9 +117,3 @@ static const struct mtrr_ops amd_mtrr_ops = { .validate_add_page = amd_validate_add_page, .have_wrcomb = positive_have_wrcomb, }; - -int __init amd_init_mtrr(void) -{ - set_mtrr_ops(&amd_mtrr_ops); - return 0; -} diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index f27177816569..b8a74eddde83 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -111,7 +111,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t return 0; } -static const struct mtrr_ops centaur_mtrr_ops = { +const struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, .set = centaur_set_mcr, .get = centaur_get_mcr, @@ -119,9 +119,3 @@ static const struct mtrr_ops centaur_mtrr_ops = { .validate_add_page = centaur_validate_add_page, .have_wrcomb = positive_have_wrcomb, }; - -int __init centaur_init_mtrr(void) -{ - set_mtrr_ops(¢aur_mtrr_ops); - return 0; -} diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index ca670919b561..173b9e01e623 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -234,51 +234,11 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, post_set(); } -typedef struct { - unsigned long base; - unsigned long size; - mtrr_type type; -} arr_state_t; - -static arr_state_t arr_state[8] = { - {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, - {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL} -}; - -static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 }; - -static void cyrix_set_all(void) -{ - int i; - - prepare_set(); - - /* the CCRs are not contiguous */ - for (i = 0; i < 4; i++) - setCx86(CX86_CCR0 + i, ccr_state[i]); - for (; i < 7; i++) - setCx86(CX86_CCR4 + i, ccr_state[i]); - - for (i = 0; i < 8; i++) { - cyrix_set_arr(i, arr_state[i].base, - arr_state[i].size, arr_state[i].type); - } - - post_set(); -} - -static const struct mtrr_ops cyrix_mtrr_ops = { +const struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, - .set_all = cyrix_set_all, .set = cyrix_set_arr, .get = cyrix_get_arr, .get_free_region = cyrix_get_free_region, .validate_add_page = generic_validate_add_page, .have_wrcomb = positive_have_wrcomb, }; - -int __init cyrix_init_mtrr(void) -{ - set_mtrr_ops(&cyrix_mtrr_ops); - return 0; -} diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 558108296f3c..ee09d359e08f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <asm/processor-flags.h> +#include <asm/cacheinfo.h> #include <asm/cpufeature.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> @@ -396,9 +397,6 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types) } } -static void prepare_set(void); -static void post_set(void); - static void __init print_mtrr_state(void) { unsigned int i; @@ -444,20 +442,6 @@ static void __init 
print_mtrr_state(void) pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } -/* PAT setup for BP. We need to go through sync steps here */ -void __init mtrr_bp_pat_init(void) -{ - unsigned long flags; - - local_irq_save(flags); - prepare_set(); - - pat_init(); - - post_set(); - local_irq_restore(flags); -} - /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { @@ -684,7 +668,10 @@ static u32 deftype_lo, deftype_hi; /** * set_mtrr_state - Set the MTRR state for this CPU. * - * NOTE: The CPU must already be in a safe state for MTRR changes. + * NOTE: The CPU must already be in a safe state for MTRR changes, including + * measures that only a single CPU can be active in set_mtrr_state() in + * order to not be subject to races for usage of deftype_lo. This is + * accomplished by taking cache_disable_lock. * RETURNS: 0 if no changes made, else a mask indicating what was changed. */ static unsigned long set_mtrr_state(void) @@ -715,106 +702,34 @@ static unsigned long set_mtrr_state(void) return change_mask; } - -static unsigned long cr4; -static DEFINE_RAW_SPINLOCK(set_atomicity_lock); - -/* - * Since we are disabling the cache don't allow any interrupts, - * they would run extremely slow and would only increase the pain. - * - * The caller must ensure that local interrupts are disabled and - * are reenabled after post_set() has been called. - */ -static void prepare_set(void) __acquires(set_atomicity_lock) +void mtrr_disable(void) { - unsigned long cr0; - - /* - * Note that this is not ideal - * since the cache is only flushed/disabled for this CPU while the - * MTRRs are changed, but changing this requires more invasive - * changes to the way the kernel boots - */ - - raw_spin_lock(&set_atomicity_lock); - - /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ - cr0 = read_cr0() | X86_CR0_CD; - write_cr0(cr0); - - /* - * Cache flushing is the most time-consuming step when programming - * the MTRRs. Fortunately, as per the Intel Software Development - * Manual, we can skip it if the processor supports cache self- - * snooping. - */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (boot_cpu_has(X86_FEATURE_PGE)) { - cr4 = __read_cr4(); - __write_cr4(cr4 & ~X86_CR4_PGE); - } - - /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - flush_tlb_local(); - /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); - - /* Again, only flush caches if we have to. 
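/*
 * Editorial sketch, not part of this patch: the '& ~0xcff' in mtrr_disable()
 * clears the low byte holding the default memory type (type in bits 2:0)
 * plus the fixed-range enable (bit 10) and MTRR enable (bit 11) fields of
 * MSR_MTRRdefType, leaving the default type as UC (0) while the MTRRs are
 * reprogrammed. The SKETCH_* names below are invented here:
 */
#include <stdint.h>

#define SKETCH_MTRR_DEF_TYPE_FIELD 0xffU	/* default type byte */
#define SKETCH_MTRR_DEF_TYPE_FE    (1U << 10)	/* fixed-range MTRRs enable */
#define SKETCH_MTRR_DEF_TYPE_E     (1U << 11)	/* MTRR enable */

static uint32_t sketch_mtrr_disabled_deftype(uint32_t deftype_lo)
{
	/* (0xff | 1 << 10 | 1 << 11) == 0xcff, matching deftype_lo & ~0xcff */
	return deftype_lo & ~(SKETCH_MTRR_DEF_TYPE_FIELD |
			      SKETCH_MTRR_DEF_TYPE_FE |
			      SKETCH_MTRR_DEF_TYPE_E);
}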
*/ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); } -static void post_set(void) __releases(set_atomicity_lock) +void mtrr_enable(void) { - /* Flush TLBs (no need to flush caches - they are disabled) */ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - flush_tlb_local(); - /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - - /* Enable caches */ - write_cr0(read_cr0() & ~X86_CR0_CD); - - /* Restore value of CR4 */ - if (boot_cpu_has(X86_FEATURE_PGE)) - __write_cr4(cr4); - raw_spin_unlock(&set_atomicity_lock); } -static void generic_set_all(void) +void mtrr_generic_set_state(void) { unsigned long mask, count; - unsigned long flags; - - local_irq_save(flags); - prepare_set(); /* Actually set the state */ mask = set_mtrr_state(); - /* also set PAT */ - pat_init(); - - post_set(); - local_irq_restore(flags); - /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof(mask) * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } - } /** @@ -836,7 +751,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, vr = &mtrr_state.var_ranges[reg]; local_irq_save(flags); - prepare_set(); + cache_disable(); if (size == 0) { /* @@ -855,7 +770,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); } - post_set(); + cache_enable(); local_irq_restore(flags); } @@ -914,8 +829,6 @@ int positive_have_wrcomb(void) * Generic structure... */ const struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = 1, - .set_all = generic_set_all, .get = generic_get_mtrr, .get_free_region = generic_get_free_region, .set = generic_set_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 2746cac9d8a9..783f3210d582 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -46,6 +46,7 @@ #include <linux/syscore_ops.h> #include <linux/rcupdate.h> +#include <asm/cacheinfo.h> #include <asm/cpufeature.h> #include <asm/e820/api.h> #include <asm/mtrr.h> @@ -58,32 +59,18 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; -static bool __mtrr_enabled; - static bool mtrr_enabled(void) { - return __mtrr_enabled; + return !!mtrr_if; } unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -static bool mtrr_aps_delayed_init; - -static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; const struct mtrr_ops *mtrr_if; -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type); - -void __init set_mtrr_ops(const struct mtrr_ops *ops) -{ - if (ops->vendor && ops->vendor < X86_VENDOR_NUM) - mtrr_ops[ops->vendor] = ops; -} - /* Returns non-zero if we have the write-combining memory type */ static int have_wrcomb(void) { @@ -119,11 +106,11 @@ static int have_wrcomb(void) } /* This function returns the number of variable MTRRs */ -static void __init set_num_var_ranges(void) +static void __init set_num_var_ranges(bool use_generic) { unsigned long config = 0, dummy; - if (use_intel()) + if (use_generic) rdmsr(MSR_MTRRcap, config, dummy); else if (is_cpu(AMD) || is_cpu(HYGON)) config = 2; @@ -160,25 +147,8 @@ static int mtrr_rendezvous_handler(void *info) { struct set_mtrr_data *data = info; - /* - * We use this same function to initialize the mtrrs during boot, - * resume, runtime cpu online and on an explicit request to set a - * specific MTRR. 
- * - * During boot or suspend, the state of the boot cpu's mtrrs has been - * saved, and we want to replicate that across all the cpus that come - * online (either at the end of boot or resume or during a runtime cpu - * online). If we're doing that, @reg is set to something special and on - * all the cpu's we do mtrr_if->set_all() (On the logical cpu that - * started the boot/resume sequence, this might be a duplicate - * set_all()). - */ - if (data->smp_reg != ~0U) { - mtrr_if->set(data->smp_reg, data->smp_base, - data->smp_size, data->smp_type); - } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { - mtrr_if->set_all(); - } + mtrr_if->set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); return 0; } @@ -248,19 +218,6 @@ static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); } -static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data = { .smp_reg = reg, - .smp_base = base, - .smp_size = size, - .smp_type = type - }; - - stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, - cpu_callout_mask); -} - /** * mtrr_add_page - Add a memory type region * @base: Physical base address of region in pages (in units of 4 kB!) @@ -617,20 +574,6 @@ int arch_phys_wc_index(int handle) } EXPORT_SYMBOL_GPL(arch_phys_wc_index); -/* - * HACK ALERT! - * These should be called implicitly, but we can't yet until all the initcall - * stuff is done... - */ -static void __init init_ifs(void) -{ -#ifndef CONFIG_X86_64 - amd_init_mtrr(); - cyrix_init_mtrr(); - centaur_init_mtrr(); -#endif -} - /* The suspend/resume methods are only for CPU without MTRR. CPU using generic * MTRR driver doesn't require this */ @@ -686,10 +629,9 @@ int __initdata changed_by_mtrr_cleanup; */ void __init mtrr_bp_init(void) { + const char *why = "(not available)"; u32 phys_addr; - init_ifs(); - phys_addr = 32; if (boot_cpu_has(X86_FEATURE_MTRR)) { @@ -730,21 +672,21 @@ void __init mtrr_bp_init(void) case X86_VENDOR_AMD: if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) { /* Pre-Athlon (K6) AMD CPU MTRRs */ - mtrr_if = mtrr_ops[X86_VENDOR_AMD]; + mtrr_if = &amd_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CENTAUR: if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) { - mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; + mtrr_if = ¢aur_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CYRIX: if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) { - mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; + mtrr_if = &cyrix_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } @@ -754,58 +696,23 @@ void __init mtrr_bp_init(void) } } - if (mtrr_if) { - __mtrr_enabled = true; - set_num_var_ranges(); + if (mtrr_enabled()) { + set_num_var_ranges(mtrr_if == &generic_mtrr_ops); init_table(); - if (use_intel()) { + if (mtrr_if == &generic_mtrr_ops) { /* BIOS may override */ - __mtrr_enabled = get_mtrr_state(); - - if (mtrr_enabled()) - mtrr_bp_pat_init(); - - if (mtrr_cleanup(phys_addr)) { - changed_by_mtrr_cleanup = 1; - mtrr_if->set_all(); + if (get_mtrr_state()) { + memory_caching_control |= CACHE_MTRR; + changed_by_mtrr_cleanup = mtrr_cleanup(phys_addr); + } else { + mtrr_if = NULL; + why = "by BIOS"; } } } - if (!mtrr_enabled()) { - pr_info("Disabled\n"); - - /* - * PAT initialization relies on MTRR's rendezvous handler. 
- * Skip PAT init until the handler can initialize both - * features independently. - */ - pat_disable("MTRRs disabled, skipping PAT initialization too."); - } -} - -void mtrr_ap_init(void) -{ if (!mtrr_enabled()) - return; - - if (!use_intel() || mtrr_aps_delayed_init) - return; - - /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries - * changed, but this routine will be called in cpu boot time, - * holding the lock breaks it. - * - * This routine is called in two cases: - * - * 1. very early time of software resume, when there absolutely - * isn't mtrr entry changes; - * - * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug - * lock to prevent mtrr entry changes - */ - set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); + pr_info("MTRRs disabled %s\n", why); } /** @@ -823,50 +730,12 @@ void mtrr_save_state(void) smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); } -void set_mtrr_aps_delayed_init(void) -{ - if (!mtrr_enabled()) - return; - if (!use_intel()) - return; - - mtrr_aps_delayed_init = true; -} - -/* - * Delayed MTRR initialization for all AP's - */ -void mtrr_aps_init(void) -{ - if (!use_intel() || !mtrr_enabled()) - return; - - /* - * Check if someone has requested the delay of AP MTRR initialization, - * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, - * then we are done. - */ - if (!mtrr_aps_delayed_init) - return; - - set_mtrr(~0U, 0, 0, 0); - mtrr_aps_delayed_init = false; -} - -void mtrr_bp_restore(void) -{ - if (!use_intel() || !mtrr_enabled()) - return; - - mtrr_if->set_all(); -} - static int __init mtrr_init_finialize(void) { if (!mtrr_enabled()) return 0; - if (use_intel()) { + if (memory_caching_control & CACHE_MTRR) { if (!changed_by_mtrr_cleanup) mtrr_state_warn(); return 0; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2ac99e561181..02eb5871492d 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -14,11 +14,8 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; - u32 use_intel_if; void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); - void (*set_all)(void); - void (*get)(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type); int (*get_free_region)(unsigned long base, unsigned long size, @@ -53,15 +50,11 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); -void mtrr_bp_pat_init(void); - -extern void __init set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) -#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) extern unsigned int num_var_ranges; extern u64 mtrr_tom2; @@ -71,10 +64,10 @@ void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); -/* CPU specific mtrr init functions */ -int amd_init_mtrr(void); -int cyrix_init_mtrr(void); -int centaur_init_mtrr(void); +/* CPU specific mtrr_ops vectors. 
*/ +extern const struct mtrr_ops amd_mtrr_ops; +extern const struct mtrr_ops cyrix_mtrr_ops; +extern const struct mtrr_ops centaur_mtrr_ops; extern int changed_by_mtrr_cleanup; extern int mtrr_cleanup(unsigned address_bits); diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index bb1c3f5f60c8..c98e52ff5f20 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -66,9 +66,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L3, .name = "L3", .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -83,9 +80,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L2, .name = "L2", .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L2), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -147,7 +141,6 @@ static inline void cache_alloc_hsw_probe(void) r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; r->alloc_capable = true; - r->alloc_enabled = true; rdt_alloc_capable = true; } @@ -211,7 +204,6 @@ static bool __get_mem_config_intel(struct rdt_resource *r) thread_throttle_mode_init(); r->alloc_capable = true; - r->alloc_enabled = true; return true; } @@ -242,7 +234,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->data_width = 4; r->alloc_capable = true; - r->alloc_enabled = true; return true; } @@ -261,7 +252,6 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; - r->alloc_enabled = true; } static void rdt_get_cdp_config(int level) @@ -300,7 +290,7 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) * that can be written to QOS_MSRs. * There are currently no SKUs which support non linear delay values. */ -u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { if (r->membw.delay_linear) return MAX_MBA_BW - bw; @@ -401,7 +391,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } -void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) +static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int i; @@ -410,12 +400,17 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) * Initialize the Control MSRs to having no control. 
* For Cache Allocation: Set all bits in cbm * For Memory Allocation: Set b/w requested to 100% - * and the bandwidth in MBps to U32_MAX */ - for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) { + for (i = 0; i < hw_res->num_closid; i++, dc++) *dc = r->default_ctrl; - *dm = MBA_MAX_MBPS; - } +} + +static void domain_free(struct rdt_hw_domain *hw_dom) +{ + kfree(hw_dom->arch_mbm_total); + kfree(hw_dom->arch_mbm_local); + kfree(hw_dom->ctrl_val); + kfree(hw_dom); } static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) @@ -423,23 +418,15 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; - u32 *dc, *dm; + u32 *dc; dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), GFP_KERNEL); if (!dc) return -ENOMEM; - dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val), - GFP_KERNEL); - if (!dm) { - kfree(dc); - return -ENOMEM; - } - hw_dom->ctrl_val = dc; - hw_dom->mbps_val = dm; - setup_default_ctrlval(r, dc, dm); + setup_default_ctrlval(r, dc); m.low = 0; m.high = hw_res->num_closid; @@ -447,39 +434,31 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +/** + * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * @num_rmid: The size of the MBM counter array + * @hw_dom: The domain that owns the allocated arrays + */ +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) { size_t tsize; - if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - } if (is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); + tsize = sizeof(*hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_total) return -ENOMEM; - } } if (is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); + tsize = sizeof(*hw_dom->arch_mbm_local); + hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_local) { + kfree(hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = NULL; return -ENOMEM; } } - if (is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); - } - return 0; } @@ -502,6 +481,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) struct list_head *add_pos = NULL; struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + int err; d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { @@ -527,25 +507,22 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(hw_dom); + domain_free(hw_dom); return; } - if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(hw_dom->ctrl_val); - kfree(hw_dom->mbps_val); - kfree(hw_dom); + if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + domain_free(hw_dom); return; } list_add_tail(&d->list, add_pos); - /* - * If resctrl is mounted, add - * 
per domain monitor data directories. - */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - mkdir_mondata_subdir_allrdtgrp(r, d); + err = resctrl_online_domain(r, d); + if (err) { + list_del(&d->list); + domain_free(hw_dom); + } } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -563,27 +540,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - rmdir_mondata_subdir_allrdtgrp(r, d->id); + resctrl_offline_domain(r, d); list_del(&d->list); - if (r->mon_capable && is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. - */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } /* * rdt_domain "d" is going to be freed below, so clear @@ -591,13 +549,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) */ if (d->plr) d->plr->d = NULL; + domain_free(hw_dom); - kfree(hw_dom->ctrl_val); - kfree(hw_dom->mbps_val); - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); - kfree(hw_dom); return; } @@ -622,7 +575,7 @@ static void clear_closid_rmid(int cpu) state->default_rmid = 0; state->cur_closid = 0; state->cur_rmid = 0; - wrmsr(IA32_PQR_ASSOC, 0, 0); + wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); } static int resctrl_online_cpu(unsigned int cpu) @@ -875,8 +828,8 @@ static __init void rdt_init_res_defs_intel(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { r->cache.arch_has_sparse_bitmaps = false; - r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; + r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; hw_res->msr_update = mba_wrmsr_intel; @@ -895,8 +848,8 @@ static __init void rdt_init_res_defs_amd(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { r->cache.arch_has_sparse_bitmaps = true; - r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; + r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_BW_BASE; hw_res->msr_update = mba_wrmsr_amd; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 87666275eed9..1df0e3262bca 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -61,6 +61,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { struct resctrl_staged_config *cfg; + u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; unsigned long bw_val; @@ -72,6 +73,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; + + if (is_mba_sc(r)) { + d->mbps_val[closid] = bw_val; + return 0; + } + cfg->new_ctrl = bw_val; cfg->have_new_ctrl = true; @@ -98,8 +105,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return false; } - if ((!r->cache.arch_has_empty_bitmaps && val == 0) || - val > r->default_ctrl) { + if ((r->cache.min_cbm_bits > 0 
&& val == 0) || val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } @@ -261,14 +267,13 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type) static bool apply_config(struct rdt_hw_domain *hw_dom, struct resctrl_staged_config *cfg, u32 idx, - cpumask_var_t cpu_mask, bool mba_sc) + cpumask_var_t cpu_mask) { struct rdt_domain *dom = &hw_dom->d_resctrl; - u32 *dc = !mba_sc ? hw_dom->ctrl_val : hw_dom->mbps_val; - if (cfg->new_ctrl != dc[idx]) { + if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); - dc[idx] = cfg->new_ctrl; + hw_dom->ctrl_val[idx] = cfg->new_ctrl; return true; } @@ -276,6 +281,27 @@ static bool apply_config(struct rdt_hw_domain *hw_dom, return false; } +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = get_config_index(closid, t); + struct msr_param msr_param; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; + + hw_dom->ctrl_val[idx] = cfg_val; + + msr_param.res = r; + msr_param.low = idx; + msr_param.high = idx + 1; + hw_res->msr_update(d, &msr_param, r); + + return 0; +} + int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; @@ -284,14 +310,12 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) enum resctrl_conf_type t; cpumask_var_t cpu_mask; struct rdt_domain *d; - bool mba_sc; int cpu; u32 idx; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - mba_sc = is_mba_sc(r); msr_param.res = NULL; list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); @@ -301,7 +325,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) continue; idx = get_config_index(closid, t); - if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc)) + if (!apply_config(hw_dom, cfg, idx, cpu_mask)) continue; if (!msr_param.res) { @@ -315,11 +339,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) } } - /* - * Avoid writing the control msr with control values when - * MBA software controller is enabled - */ - if (cpumask_empty(cpu_mask) || mba_sc) + if (cpumask_empty(cpu_mask)) goto done; cpu = get_cpu(); /* Update resource control msr on this CPU if it's in cpu_mask. */ @@ -406,6 +426,14 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; + + /* + * Writes to mba_sc resources update the software controller, + * not the control MSR. 
+ */ + if (is_mba_sc(r)) + continue; + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret) goto out; @@ -433,9 +461,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); u32 idx = get_config_index(closid, type); - if (!is_mba_sc(r)) - return hw_dom->ctrl_val[idx]; - return hw_dom->mbps_val[idx]; + return hw_dom->ctrl_val[idx]; } static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) @@ -450,8 +476,12 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo if (sep) seq_puts(s, ";"); - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); + if (is_mba_sc(r)) + ctrl_val = dom->mbps_val[closid]; + else + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); + seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; @@ -518,7 +548,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; - struct rdt_hw_resource *hw_res; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; @@ -538,8 +567,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) domid = md.u.domid; evtid = md.u.evtid; - hw_res = &rdt_resources_all[resid]; - r = &hw_res->r_resctrl; + r = &rdt_resources_all[resid].r_resctrl; d = rdt_find_domain(r, domid, NULL); if (IS_ERR_OR_NULL(d)) { ret = -ENOENT; @@ -548,12 +576,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) mon_event_read(&rr, r, d, rdtgrp, evtid, false); - if (rr.val & RMID_VAL_ERROR) + if (rr.err == -EIO) seq_puts(m, "Error\n"); - else if (rr.val & RMID_VAL_UNAVAIL) + else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); else - seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale); + seq_printf(m, "%llu\n", rr.val); out: rdtgroup_kn_unlock(of->kn); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 1d647188a43b..5ebd28e6aa0c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -8,35 +8,16 @@ #include <linux/fs_context.h> #include <linux/jump_label.h> -#define MSR_IA32_L3_QOS_CFG 0xc81 -#define MSR_IA32_L2_QOS_CFG 0xc82 -#define MSR_IA32_L3_CBM_BASE 0xc90 -#define MSR_IA32_L2_CBM_BASE 0xd10 -#define MSR_IA32_MBA_THRTL_BASE 0xd50 -#define MSR_IA32_MBA_BW_BASE 0xc0000200 - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 - #define CQM_LIMBOCHECK_INTERVAL 1000 #define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 -#define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 @@ -74,7 +55,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * @list: entry in &rdt_resource->evt_list */ struct mon_evt { - u32 evtid; + enum resctrl_event_id evtid; char *name; struct list_head list; }; @@ -91,9 +72,9 @@ struct mon_evt { union mon_data_bits { void *priv; struct { - unsigned int rid : 10; - unsigned int evtid : 8; - unsigned int domid : 14; + unsigned int rid : 10; + enum resctrl_event_id evtid : 8; + unsigned int domid : 14; } u; }; @@ 
-101,12 +82,12 @@ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; struct rdt_domain *d; - int evtid; + enum resctrl_event_id evtid; bool first; + int err; u64 val; }; -extern unsigned int resctrl_cqm_threshold; extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; @@ -288,35 +269,45 @@ struct rftype { /** * struct mbm_state - status for each MBM counter in each domain - * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) - * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it - * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation * @prev_bw: The most recent bandwidth in MBps * @delta_bw: Difference between the current and previous bandwidth * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { - u64 chunks; - u64 prev_msr; - u64 prev_bw_msr; + u64 prev_bw_bytes; u32 prev_bw; u32 delta_bw; bool delta_comp; }; /** + * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s + * return value. + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to + * find this struct. + */ +struct arch_mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share * a resource * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps + * @arch_mbm_total: arch private state for MBM total bandwidth + * @arch_mbm_local: arch private state for MBM local bandwidth * * Members of this structure are accessed via helpers that provide abstraction. 
*/ struct rdt_hw_domain { struct rdt_domain d_resctrl; u32 *ctrl_val; - u32 *mbps_val; + struct arch_mbm_state *arch_mbm_total; + struct arch_mbm_state *arch_mbm_local; }; static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) @@ -459,14 +450,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); for_each_rdt_resource(r) \ if (r->mon_capable) -#define for_each_alloc_enabled_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_enabled) - -#define for_each_mon_enabled_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->mon_enabled) - /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { struct { @@ -530,10 +513,6 @@ void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - unsigned int dom_id); -void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); @@ -542,8 +521,6 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); -u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index eaf25a234ff5..efe0c30d3a12 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -16,8 +16,12 @@ */ #include <linux/module.h> +#include <linux/sizes.h> #include <linux/slab.h> + #include <asm/cpu_device_id.h> +#include <asm/resctrl.h> + #include "internal.h" struct rmid_entry { @@ -37,8 +41,8 @@ static LIST_HEAD(rmid_free_lru); * @rmid_limbo_count count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that - * may have a occupancy value > intel_cqm_threshold. User can change - * the threshold occupancy value. + * may have a occupancy value > resctrl_rmid_realloc_threshold. User can + * change the threshold occupancy value. */ static unsigned int rmid_limbo_count; @@ -59,10 +63,15 @@ bool rdt_mon_capable; unsigned int rdt_mon_features; /* - * This is the threshold cache occupancy at which we will consider an + * This is the threshold cache occupancy in bytes at which we will consider an * RMID available for re-allocation. */ -unsigned int resctrl_cqm_threshold; +unsigned int resctrl_rmid_realloc_threshold; + +/* + * This is the maximum value for the reallocation threshold, in bytes. 
+ */ +unsigned int resctrl_rmid_realloc_limit; #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) @@ -137,9 +146,54 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid) return entry; } -static u64 __rmid_read(u32 rmid, u32 eventid) +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, + u32 rmid, + enum resctrl_event_id eventid) { - u64 val; + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + return NULL; + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &hw_dom->arch_mbm_total[rmid]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &hw_dom->arch_mbm_local[rmid]; + } + + /* Never expect to get here */ + WARN_ON_ONCE(1); + + return NULL; +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) + memset(am, 0, sizeof(*am)); +} + +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) +{ + u64 shift = 64 - width, chunks; + + chunks = (cur_msr << shift) - (prev_msr << shift); + return chunks >> shift; +} + +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 *val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + u64 msr_val, chunks; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; /* * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured @@ -150,16 +204,26 @@ static u64 __rmid_read(u32 rmid, u32 eventid) * are error bits. */ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - return val; -} + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } -static bool rmid_dirty(struct rmid_entry *entry) -{ - u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + *val = chunks * hw_res->mon_scale; - return val >= resctrl_cqm_threshold; + return 0; } /* @@ -170,11 +234,11 @@ static bool rmid_dirty(struct rmid_entry *entry) */ void __check_limbo(struct rdt_domain *d, bool force_free) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rmid_entry *entry; - struct rdt_resource *r; u32 crmid = 1, nrmid; - - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool rmid_dirty; + u64 val = 0; /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that @@ -188,7 +252,15 @@ void __check_limbo(struct rdt_domain *d, bool force_free) break; entry = __rmid_entry(nrmid); - if (force_free || !rmid_dirty(entry)) { + + if (resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val)) { + rmid_dirty = true; + } else { + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + } + + if (force_free || !rmid_dirty) { clear_bit(entry->rmid, d->rmid_busy_llc); if (!--entry->busy) { rmid_limbo_count--; @@ -227,19 +299,19 @@ int alloc_rmid(void) static void add_rmid_to_limbo(struct rmid_entry *entry) { - struct rdt_resource *r; + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu; - u64 val; - 
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + int cpu, err; + u64 val = 0; entry->busy = 0; cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); - if (val <= resctrl_cqm_threshold) + err = resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, + &val); + if (err || val <= resctrl_rmid_realloc_threshold) continue; } @@ -277,24 +349,18 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) +static int __mon_event_count(u32 rmid, struct rmid_read *rr) { - u64 shift = 64 - width, chunks; + struct mbm_state *m; + u64 tval = 0; - chunks = (cur_msr << shift) - (prev_msr << shift); - return chunks >> shift; -} + if (rr->first) + resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); -static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) -{ - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); - struct mbm_state *m; - u64 chunks, tval; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + if (rr->err) + return rr->err; - tval = __rmid_read(rmid, rr->evtid); - if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { - return tval; - } switch (rr->evtid) { case QOS_L3_OCCUP_EVENT_ID: rr->val += tval; @@ -308,48 +374,47 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) default: /* * Code would never reach here because an invalid - * event id would fail the __rmid_read. + * event id would fail in resctrl_arch_rmid_read(). */ - return RMID_VAL_ERROR; + return -EINVAL; } if (rr->first) { memset(m, 0, sizeof(struct mbm_state)); - m->prev_bw_msr = m->prev_msr = tval; return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width); - m->chunks += chunks; - m->prev_msr = tval; - - rr->val += get_corrected_mbm_count(rmid, m->chunks); + rr->val += tval; return 0; } /* + * mbm_bw_count() - Update bw count from values previously read by + * __mon_event_count(). + * @rmid: The rmid used to identify the cached mbm_state. + * @rr: The struct rmid_read populated by __mon_event_count(). + * * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. + * and delta bandwidth in MBps. The chunks value previously read by + * __mon_event_count() is compared with the chunks value from the previous + * invocation. This must be called once per second to maintain values in MBps. 
*/ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) { - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); struct mbm_state *m = &rr->d->mbm_local[rmid]; - u64 tval, cur_bw, chunks; + u64 cur_bw, bytes, cur_bytes; - tval = __rmid_read(rmid, rr->evtid); - if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; + cur_bytes = rr->val; + bytes = cur_bytes - m->prev_bw_bytes; + m->prev_bw_bytes = cur_bytes; - chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width); - cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20; + cur_bw = bytes / SZ_1M; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); m->delta_comp = false; m->prev_bw = cur_bw; - m->prev_bw_msr = tval; } /* @@ -361,11 +426,11 @@ void mon_event_count(void *info) struct rdtgroup *rdtgrp, *entry; struct rmid_read *rr = info; struct list_head *head; - u64 ret_val; + int ret; rdtgrp = rr->rgrp; - ret_val = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -377,13 +442,17 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { if (__mon_event_count(entry->mon.rmid, rr) == 0) - ret_val = 0; + ret = 0; } } - /* Report error if none of rmid_reads are successful */ - if (ret_val) - rr->val = ret_val; + /* + * __mon_event_count() calls for newly created monitor groups may + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. + * Discard error if any of the monitor event reads succeeded. + */ + if (ret == 0) + rr->err = 0; } /* @@ -420,10 +489,8 @@ void mon_event_count(void *info) */ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { - u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; + u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_hw_resource *hw_r_mba; - struct rdt_hw_domain *hw_dom_mba; u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; @@ -433,8 +500,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) if (!is_mbm_local_enabled()) return; - hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; - r_mba = &hw_r_mba->r_resctrl; + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + closid = rgrp->closid; rmid = rgrp->mon.rmid; pmbm_data = &dom_mbm->mbm_local[rmid]; @@ -444,16 +511,13 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) pr_warn_once("Failure to get domain for MBA update\n"); return; } - hw_dom_mba = resctrl_to_arch_dom(dom_mba); cur_bw = pmbm_data->prev_bw; - user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); + user_bw = dom_mba->mbps_val[closid]; delta_bw = pmbm_data->delta_bw; - /* - * resctrl_arch_get_config() chooses the mbps/ctrl value to return - * based on is_mba_sc(). For now, reach into the hw_dom. - */ - cur_msr_val = hw_dom_mba->ctrl_val[closid]; + + /* MBA resource doesn't support CDP */ + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); /* * For Ctrl groups read data from child monitor groups. 
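
The monitor.c hunks above split the MBM accounting in two: resctrl_arch_rmid_read() hides the limited MSR width behind mbm_overflow_count() and hands back byte values, and mbm_bw_count() then only has to diff successive byte readings taken one second apart. Below is a rough standalone model of that arithmetic; the 24-bit counter width, the 64-byte chunk size and the sample readings are invented example values, not anything a particular CPU reports.

/*
 * Standalone model (not kernel code) of the counter arithmetic above.
 * The width, scale and readings are made-up examples.
 */
#include <stdint.h>
#include <stdio.h>

/* Wrap-safe delta of a free-running counter that is only 'width' bits wide. */
static uint64_t overflow_count(uint64_t prev, uint64_t cur, unsigned int width)
{
	unsigned int shift = 64 - width;

	/* Shift both readings up so the subtraction wraps at 2^width. */
	return ((cur << shift) - (prev << shift)) >> shift;
}

int main(void)
{
	const unsigned int width = 24;            /* example counter width */
	const uint64_t chunk_bytes = 64;          /* example scale factor */
	uint64_t prev = 0xfffff0, cur = 0x000010; /* counter wrapped between reads */
	uint64_t chunks = overflow_count(prev, cur, width);

	printf("chunks=%llu (+%llu bytes)\n",
	       (unsigned long long)chunks,
	       (unsigned long long)(chunks * chunk_bytes));

	/* Bandwidth: byte delta between two readings taken one second apart. */
	uint64_t prev_bytes = 0, cur_bytes = 512ULL * 1024 * 1024;

	printf("bandwidth=%llu MBps\n",
	       (unsigned long long)((cur_bytes - prev_bytes) / (1024 * 1024)));
	return 0;
}

The shift trick works because unsigned subtraction already wraps modulo 2^64; shifting both readings up by (64 - width) makes the same subtraction wrap modulo 2^width instead.
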
@@ -488,9 +552,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) return; } - cur_msr = hw_r_mba->msr_base + closid; - wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); - hw_dom_mba->ctrl_val[closid] = new_msr_val; + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); /* * Delta values are updated dynamically package wise for each @@ -523,10 +585,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) */ if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + rr.val = 0; __mon_event_count(rmid, &rr); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + rr.val = 0; __mon_event_count(rmid, &rr); /* @@ -686,9 +750,10 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - unsigned int cl_size = boot_cpu_data.x86_cache_size; + unsigned int threshold; int ret; + resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; @@ -705,10 +770,14 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->num_rmid; - /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ - resctrl_cqm_threshold /= hw_res->mon_scale; + /* + * Because num_rmid may not be a power of two, round the value + * to the nearest multiple of hw_res->mon_scale so it matches a + * value the hardware will measure. mon_scale may not be a power of 2. + */ + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); ret = dom_data_init(r); if (ret) @@ -717,7 +786,6 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) l3_mon_evt_init(r); r->mon_capable = true; - r->mon_enabled = true; return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index db813f819ad6..524f8ff3e69c 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -420,6 +420,7 @@ static int pseudo_lock_fn(void *_rdtgrp) struct pseudo_lock_region *plr = rdtgrp->plr; u32 rmid_p, closid_p; unsigned long i; + u64 saved_msr; #ifdef CONFIG_KASAN /* * The registers used for local register variables are also used @@ -463,6 +464,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ + saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); @@ -475,7 +477,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); /* * Cache was flushed earlier. Now access kernel memory to read it * into cache region associated with just activated plr->closid. @@ -511,10 +513,10 @@ static int pseudo_lock_fn(void *_rdtgrp) * Critical section end: restore closid with capacity bitmask that * does not overlap with pseudo-locked region. 
*/ - __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -835,7 +837,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * First determine which cpus have pseudo-locked regions * associated with them. */ - for_each_alloc_enabled_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { list_for_each_entry(d_i, &r->domains, list) { if (d_i->plr) cpumask_or(cpu_with_psl, cpu_with_psl, @@ -871,6 +873,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) static int measure_cycles_lat_fn(void *_plr) { struct pseudo_lock_region *plr = _plr; + u32 saved_low, saved_high; unsigned long i; u64 start, end; void *mem_r; @@ -879,6 +882,7 @@ static int measure_cycles_lat_fn(void *_plr) /* * Disable hardware prefetchers. */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); mem_r = READ_ONCE(plr->kmem); /* @@ -895,7 +899,7 @@ static int measure_cycles_lat_fn(void *_plr) end = rdtsc_ordered(); trace_pseudo_lock_mem_latency((u32)(end - start)); } - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); plr->thread_done = 1; wake_up_interruptible(&plr->lock_thread_wq); @@ -940,6 +944,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; struct perf_event *miss_event, *hit_event; int hit_pmcnum, miss_pmcnum; + u32 saved_low, saved_high; unsigned int line_size; unsigned int size; unsigned long i; @@ -973,6 +978,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, /* * Disable hardware prefetchers. 
*/ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); /* Initialize rest of local variables */ @@ -1031,7 +1037,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, */ rmb(); /* Re-enable hardware prefetchers */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); out_hit: perf_event_release_kernel(hit_event); @@ -1554,9 +1560,9 @@ static const struct file_operations pseudo_lock_dev_fops = { .mmap = pseudo_lock_dev_mmap, }; -static char *pseudo_lock_devnode(struct device *dev, umode_t *mode) +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) { - struct rdtgroup *rdtgrp; + const struct rdtgroup *rdtgrp; rdtgrp = dev_get_drvdata(dev); if (mode) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f276aff521e8..e5a48f05e787 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, static int max_threshold_occ_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - - seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale); + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); return 0; } @@ -1055,7 +1052,6 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct rdt_hw_resource *hw_res; unsigned int bytes; int ret; @@ -1063,11 +1059,10 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, if (ret) return ret; - if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + if (bytes > resctrl_rmid_realloc_limit) return -EINVAL; - hw_res = resctrl_to_arch_res(of->kn->parent->priv); - resctrl_cqm_threshold = bytes / hw_res->mon_scale; + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); return nbytes; } @@ -1356,11 +1351,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct resctrl_schema *schema; + enum resctrl_conf_type type; struct rdtgroup *rdtgrp; struct rdt_resource *r; struct rdt_domain *d; unsigned int size; int ret = 0; + u32 closid; bool sep; u32 ctrl; @@ -1386,8 +1383,11 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, goto out; } + closid = rdtgrp->closid; + list_for_each_entry(schema, &resctrl_schema_all, list) { r = schema->res; + type = schema->conf_type; sep = false; seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(d, &r->domains, list) { @@ -1396,9 +1396,12 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - ctrl = resctrl_arch_get_config(r, d, - rdtgrp->closid, - schema->conf_type); + if (is_mba_sc(r)) + ctrl = d->mbps_val[closid]; + else + ctrl = resctrl_arch_get_config(r, d, + closid, + type); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -1756,7 +1759,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (ret) goto out_destroy; - /* loop over enabled controls, these are all alloc_enabled */ + /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; fflags = r->fflags | RF_CTRL_INFO; 
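
Two of the hunks above, rdt_get_mon_l3_config() and max_threshold_occ_write(), switch the limbo threshold from hardware "chunks" to plain bytes and leave the hardware granularity to resctrl_arch_round_mon_val(). That helper's body is not shown in this patch; the sketch below assumes it simply snaps a byte count to the nearest multiple of the occupancy scale factor, following the comment in rdt_get_mon_l3_config(), and reuses the 35 MB LLC / 56 RMID example from that comment. The mon_scale value is invented.

/*
 * Userspace sketch, not the real resctrl_arch_round_mon_val(): round a
 * byte threshold to a value the hardware can actually report. If the real
 * helper rounds down instead of to nearest, only the last digit changes.
 */
#include <stdio.h>

static unsigned int round_to_scale(unsigned int bytes, unsigned int scale)
{
	return ((bytes + scale / 2) / scale) * scale;  /* nearest multiple */
}

int main(void)
{
	unsigned int mon_scale = 61440;          /* example bytes per h/w unit */
	unsigned int limit = 35 * 1024 * 1024;   /* example: 35 MB LLC */
	unsigned int num_rmid = 56;              /* example RMID count */
	unsigned int threshold = limit / num_rmid;

	printf("raw threshold: %u bytes, rounded: %u bytes\n",
	       threshold, round_to_scale(threshold, mon_scale));
	return 0;
}
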
@@ -1765,7 +1768,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) goto out_destroy; } - for_each_mon_enabled_rdt_resource(r) { + for_each_mon_capable_rdt_resource(r) { fflags = r->fflags | RF_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); @@ -1889,26 +1892,61 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) +{ + u32 num_closid = resctrl_arch_get_num_closid(r); + int cpu = cpumask_any(&d->cpu_mask); + int i; + + d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), + GFP_KERNEL, cpu_to_node(cpu)); + if (!d->mbps_val) + return -ENOMEM; + + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + + return 0; +} + +static void mba_sc_domain_destroy(struct rdt_resource *r, + struct rdt_domain *d) +{ + kfree(d->mbps_val); + d->mbps_val = NULL; +} + /* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. * MBA software controller is supported only if * MBM is supported and MBA is in linear scale. */ +static bool supports_mba_mbps(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + + return (is_mbm_local_enabled() && + r->alloc_capable && is_mba_linear()); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + */ static int set_mba_sc(bool mba_sc) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - struct rdt_hw_domain *hw_dom; + u32 num_closid = resctrl_arch_get_num_closid(r); struct rdt_domain *d; + int i; - if (!is_mbm_enabled() || !is_mba_linear() || - mba_sc == is_mba_sc(r)) + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) return -EINVAL; r->membw.mba_sc = mba_sc; + list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); - setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val); + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; } return 0; @@ -2106,7 +2144,7 @@ static int schemata_list_create(void) struct rdt_resource *r; int ret = 0; - for_each_alloc_enabled_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { if (resctrl_arch_get_cdp_enabled(r->rid)) { ret = schemata_list_add(r, CDP_CODE); if (ret) @@ -2261,7 +2299,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->enable_cdpl2 = true; return 0; case Opt_mba_mbps: - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (!supports_mba_mbps()) return -EINVAL; ctx->enable_mba_mbps = true; return 0; @@ -2452,7 +2490,7 @@ static void rdt_kill_sb(struct super_block *sb) set_mba_sc(false); /*Put everything back to default values. */ - for_each_alloc_enabled_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); cdp_disable_all(); rmdir_all_sub(); @@ -2499,14 +2537,12 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, * Remove all subdirectories of mon_data of ctrl_mon groups * and monitor groups with given domain id. 
*/ -void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id) { struct rdtgroup *prgrp, *crgrp; char name[32]; - if (!r->mon_enabled) - return; - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { sprintf(name, "mon_%s_%02d", r->name, dom_id); kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); @@ -2565,16 +2601,13 @@ out_destroy: * Add all subdirectories of mon_data for "ctrl_mon" groups * and "monitor" groups with given domain id. */ -void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d) +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; struct list_head *head; - if (!r->mon_enabled) - return; - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { parent_kn = prgrp->mon.mon_data_kn; mkdir_mondata_subdir(parent_kn, d, r, prgrp); @@ -2642,7 +2675,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, * Create the subdirectories for each domain. Note that all events * in a domain like L3 are grouped into a resource whose domain is L3 */ - for_each_mon_enabled_rdt_resource(r) { + for_each_mon_capable_rdt_resource(r) { ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); if (ret) goto out_destroy; @@ -2786,14 +2819,19 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) } /* Initialize MBA resource with default values. */ -static void rdtgroup_init_mba(struct rdt_resource *r) +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { + if (is_mba_sc(r)) { + d->mbps_val[closid] = MBA_MAX_MBPS; + continue; + } + cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + cfg->new_ctrl = r->default_ctrl; cfg->have_new_ctrl = true; } } @@ -2808,7 +2846,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA) { - rdtgroup_init_mba(r); + rdtgroup_init_mba(r, rdtgrp->closid); + if (is_mba_sc(r)) + continue; } else { ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) @@ -3236,6 +3276,110 @@ out: return ret; } +static void domain_destroy_mon_state(struct rdt_domain *d) +{ + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); +} + +void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + mba_sc_domain_destroy(r, d); + + if (!r->mon_capable) + return; + + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); + + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. 
+ */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + domain_destroy_mon_state(d); +} + +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + return 0; +} + +int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + int err; + + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + /* RDT_RESOURCE_MBA is never mon_capable */ + return mba_sc_domain_allocate(r, d); + + if (!r->mon_capable) + return 0; + + err = domain_setup_mon_state(r, d); + if (err) + return err; + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + if (is_llc_occupancy_enabled()) + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + + /* If resctrl is mounted, add per domain monitor data directories. */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); + + return 0; +} + /* * rdtgroup_init - rdtgroup initialization * diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index fd44b54c90d5..f53944fb8f7f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,11 +40,13 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 24c1bb8eb196..2a0e90fe2abc 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,9 @@ #include "encls.h" #include "sgx.h" +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); + #define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) /* * 32 PCMD entries share a PCMD page. 
PCMD_FIRST_MASK is used to @@ -157,8 +160,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, return ret; pginfo.addr = encl_page->desc & PAGE_MASK; - pginfo.contents = (unsigned long)kmap_atomic(b.contents); - pcmd_page = kmap_atomic(b.pcmd); + pginfo.contents = (unsigned long)kmap_local_page(b.contents); + pcmd_page = kmap_local_page(b.pcmd); pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; if (secs_page) @@ -184,8 +187,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, */ pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); - kunmap_atomic(pcmd_page); - kunmap_atomic((void *)(unsigned long)pginfo.contents); + kunmap_local(pcmd_page); + kunmap_local((void *)(unsigned long)pginfo.contents); get_page(b.pcmd); sgx_encl_put_backing(&b); @@ -194,10 +197,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); - pcmd_page = kmap_atomic(b.pcmd); + pcmd_page = kmap_local_page(b.pcmd); if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) pr_warn("PCMD page not empty after truncate.\n"); - kunmap_atomic(pcmd_page); + kunmap_local(pcmd_page); } put_page(b.pcmd); @@ -265,7 +268,7 @@ static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, unsigned long addr, unsigned long vm_flags) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; struct sgx_encl_page *entry; entry = xa_load(&encl->page_array, PFN_DOWN(addr)); @@ -344,8 +347,11 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, } va_page = sgx_encl_grow(encl, false); - if (IS_ERR(va_page)) + if (IS_ERR(va_page)) { + if (PTR_ERR(va_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; goto err_out_epc; + } if (va_page) list_add(&va_page->list, &encl->va_pages); @@ -496,7 +502,7 @@ static void sgx_vma_open(struct vm_area_struct *vma) int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, unsigned long end, unsigned long vm_flags) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; struct sgx_encl_page *page; unsigned long count = 0; int ret = 0; @@ -674,11 +680,15 @@ const struct vm_operations_struct sgx_vm_ops = { void sgx_encl_release(struct kref *ref) { struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1); struct sgx_va_page *va_page; struct sgx_encl_page *entry; - unsigned long index; + unsigned long count = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base)); - xa_for_each(&encl->page_array, index, entry) { + xas_lock(&xas); + xas_for_each(&xas, entry, max_page_index) { if (entry->epc_page) { /* * The page and its radix tree entry cannot be freed @@ -693,9 +703,20 @@ void sgx_encl_release(struct kref *ref) } kfree(entry); - /* Invoke scheduler to prevent soft lockups. */ - cond_resched(); + /* + * Invoke scheduler on every XA_CHECK_SCHED iteration + * to prevent soft lockups. 
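The enclave release loop above now walks the page xarray with xas_for_each() and, rather than rescheduling after every entry, pauses the iteration and yields only once per XA_CHECK_SCHED entries. A minimal userspace analogue of that batch-then-yield pattern, with a plain linked list in place of the xarray and sched_yield() standing in for cond_resched() (the list size and batch constant are invented for the sketch):

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK_SCHED 4096	/* stand-in for XA_CHECK_SCHED */

struct node {
	struct node *next;
	int payload;
};

/* Free a long list, yielding the CPU once per CHECK_SCHED entries. */
static void release_all(struct node *head)
{
	unsigned long count = 0;

	while (head) {
		struct node *next = head->next;

		free(head);
		head = next;

		/* analogue of the xas_pause()/cond_resched() batch above */
		if (!(++count % CHECK_SCHED))
			sched_yield();
	}
	printf("released %lu entries\n", count);
}

int main(void)
{
	struct node *head = NULL;

	for (int i = 0; i < 100000; i++) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			break;
		n->payload = i;
		n->next = head;
		head = n;
	}
	release_all(head);
	return 0;
}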
+ */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + + cond_resched(); + + xas_lock(&xas); + } } + xas_unlock(&xas); xa_destroy(&encl->page_array); @@ -906,15 +927,14 @@ const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, pgoff_t index) { - struct inode *inode = encl->backing->f_path.dentry->d_inode; - struct address_space *mapping = inode->i_mapping; + struct address_space *mapping = encl->backing->f_mapping; gfp_t gfpmask = mapping_gfp_mask(mapping); return shmem_read_mapping_page_gfp(mapping, index, gfpmask); } /** - * sgx_encl_get_backing() - Pin the backing storage + * __sgx_encl_get_backing() - Pin the backing storage * @encl: an enclave pointer * @page_index: enclave page index * @backing: data for accessing backing storage for the page @@ -926,7 +946,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, * 0 on success, * -errno otherwise. */ -static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, +static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); @@ -1001,7 +1021,7 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) } /** - * sgx_encl_alloc_backing() - allocate a new backing storage page + * sgx_encl_alloc_backing() - create a new backing storage page * @encl: an enclave pointer * @page_index: enclave page index * @backing: data for accessing backing storage for the page @@ -1009,7 +1029,9 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) * When called from ksgxd, sets the active memcg from one of the * mms in the enclave's mm_list prior to any backing page allocation, * in order to ensure that shmem page allocations are charged to the - * enclave. + * enclave. Create a backing page for loading data back into an EPC page with + * ELDU. This function takes a reference on a new backing page which + * must be dropped with a corresponding call to sgx_encl_put_backing(). * * Return: * 0 on success, @@ -1022,7 +1044,7 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, struct mem_cgroup *memcg = set_active_memcg(encl_memcg); int ret; - ret = sgx_encl_get_backing(encl, page_index, backing); + ret = __sgx_encl_get_backing(encl, page_index, backing); set_active_memcg(memcg); mem_cgroup_put(encl_memcg); @@ -1040,15 +1062,17 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, * It is the caller's responsibility to ensure that it is appropriate to use * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is * not used correctly, this will cause an allocation which is not accounted for. + * This function takes a reference on an existing backing page which must be + * dropped with a corresponding call to sgx_encl_put_backing(). * * Return: * 0 on success, * -errno otherwise. 
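The kernel-doc added just above spells out the pairing rule: every successful sgx_encl_alloc_backing() or sgx_encl_lookup_backing() pins a backing page that must be released with a matching sgx_encl_put_backing(). As a generic illustration of that get/put discipline only (plain C, invented names, no shmem or SGX involved):

#include <assert.h>
#include <stdio.h>

struct backing { int refcount; };

/* "Pin" the backing page: the caller now owns one reference. */
static int backing_get(struct backing *b)
{
	b->refcount++;
	return 0;
}

/* Drop the reference taken by a successful backing_get(). */
static void backing_put(struct backing *b)
{
	assert(b->refcount > 0);
	b->refcount--;
}

int main(void)
{
	struct backing b = { .refcount = 0 };

	if (!backing_get(&b)) {
		/* ... use the pinned page ... */
		backing_put(&b);
	}
	assert(b.refcount == 0);	/* balanced: nothing left pinned */
	puts("every get was matched by a put");
	return 0;
}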
*/ -int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { - return sgx_encl_get_backing(encl, page_index, backing); + return __sgx_encl_get_backing(encl, page_index, backing); } /** diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index a65a952116fd..f94ff14c9486 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -107,8 +107,6 @@ bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); -int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); void sgx_encl_put_backing(struct sgx_backing *backing); diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index ebe79d60619f..21ca0a831b70 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -111,7 +111,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; - encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS; + encl->attributes_mask = SGX_ATTR_UNPRIV_MASK; /* Set only after completion, as encl->lock has not been taken. */ set_bit(SGX_ENCL_CREATED, &encl->flags); @@ -221,11 +221,11 @@ static int __sgx_encl_add_page(struct sgx_encl *encl, pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.metadata = (unsigned long)secinfo; - pginfo.contents = (unsigned long)kmap_atomic(src_page); + pginfo.contents = (unsigned long)kmap_local_page(src_page); ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page)); - kunmap_atomic((void *)pginfo.contents); + kunmap_local((void *)pginfo.contents); put_page(src_page); return ret ? -EIO : 0; @@ -356,6 +356,9 @@ static int sgx_validate_offset_length(struct sgx_encl *encl, if (!length || !IS_ALIGNED(length, PAGE_SIZE)) return -EINVAL; + if (offset + length < offset) + return -EINVAL; + if (offset + length - PAGE_SIZE >= encl->size) return -EINVAL; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 515e2a5f25bb..e5a37b6e9aa5 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -49,9 +49,13 @@ static LIST_HEAD(sgx_dirty_page_list); * Reset post-kexec EPC pages to the uninitialized state. The pages are removed * from the input list, and made available for the page allocator. SECS pages * prepending their children in the input list are left intact. + * + * Return 0 when sanitization was successful or kthread was stopped, and the + * number of unsanitized pages otherwise. 
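A few hunks up, sgx_validate_offset_length() gains an explicit wrap-around test: for unsigned values, offset + length < offset is true exactly when the addition overflowed, which would otherwise let a wrapped range slip past the later size comparison. A tiny self-contained demonstration of that property (hypothetical values, not the SGX code itself):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the added test: true when offset + length wraps around. */
static int range_wraps(uint64_t offset, uint64_t length)
{
	return offset + length < offset;
}

int main(void)
{
	/* ordinary range: no wrap */
	assert(!range_wraps(0x1000, 0x2000));

	/* end of the range would pass UINT64_MAX: the sum wraps and is caught */
	assert(range_wraps(UINT64_MAX - 0xfff, 0x2000));

	puts("wrap-around check behaves as expected");
	return 0;
}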
*/ -static void __sgx_sanitize_pages(struct list_head *dirty_page_list) +static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list) { + unsigned long left_dirty = 0; struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; @@ -59,7 +63,7 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) /* dirty_page_list is thread-local, no need for a lock: */ while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) - return; + return 0; page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); @@ -92,12 +96,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) } else { /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); + left_dirty++; } cond_resched(); } list_splice(&dirty, dirty_page_list); + return left_dirty; } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -159,17 +165,17 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, pginfo.addr = 0; pginfo.secs = 0; - pginfo.contents = (unsigned long)kmap_atomic(backing->contents); - pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) + + pginfo.contents = (unsigned long)kmap_local_page(backing->contents); + pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) + backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); set_page_dirty(backing->pcmd); set_page_dirty(backing->contents); - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - + kunmap_local((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); - kunmap_atomic((void *)(unsigned long)pginfo.contents); + kunmap_local((void *)(unsigned long)pginfo.contents); return ret; } @@ -395,10 +401,7 @@ static int ksgxd(void *p) * required for SECS pages, whose child pages blocked EREMOVE. 
*/ __sgx_sanitize_pages(&sgx_dirty_page_list); - __sgx_sanitize_pages(&sgx_dirty_page_list); - - /* sanity check: */ - WARN_ON(!list_empty(&sgx_dirty_page_list)); + WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 132a2de44d2f..5e868b62a7c4 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int die_select_mask, die_level_siblings; + unsigned int pkg_mask_width; bool die_level_present = false; int leaf; @@ -111,10 +112,10 @@ int detect_extended_topology(struct cpuinfo_x86 *c) core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; - do { + while (true) { cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); /* @@ -132,10 +133,15 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } + if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) + pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + else + break; + sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + } - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; @@ -148,7 +154,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) } c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - die_plus_mask_width); + pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index ec7bbac3a9f2..8009c8346d8f 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -58,24 +58,6 @@ static void tsx_enable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -static bool tsx_ctrl_is_supported(void) -{ - u64 ia32_cap = x86_read_arch_cap_msr(); - - /* - * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this - * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. - * - * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a - * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES - * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get - * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, - * tsx= cmdline requests will do nothing on CPUs without - * MSR_IA32_TSX_CTRL support. 
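The detect_extended_topology() hunk earlier in this chunk derives pkg_mask_width from the last valid CPUID topology subleaf and uses it, rather than the core/die widths, for core_select_mask and the phys_pkg_id() shift. The mask arithmetic itself is plain bit manipulation; a userspace sketch with made-up shift widths and a made-up APIC ID shows how the IDs fall out (phys_pkg_id() is modelled here as a simple right shift):

#include <stdio.h>

int main(void)
{
	/* hypothetical shift widths as CPUID topology subleaves would report them */
	unsigned int ht_mask_width  = 1;	/* SMT level shift                 */
	unsigned int pkg_mask_width = 6;	/* shift of the last valid subleaf */

	/* same expression as the hunk, written with an unsigned constant */
	unsigned int core_select_mask = (~(~0u << pkg_mask_width)) >> ht_mask_width;

	unsigned int apicid  = 0x2b;		/* made-up initial APIC ID */
	unsigned int smt_id  = apicid & ~(~0u << ht_mask_width);
	unsigned int core_id = (apicid >> ht_mask_width) & core_select_mask;
	/* package ID: everything above pkg_mask_width bits */
	unsigned int pkg_id  = apicid >> pkg_mask_width;

	printf("apicid 0x%x -> smt %u, core %u, package %u (core_select_mask 0x%x)\n",
	       apicid, smt_id, core_id, pkg_id, core_select_mask);
	return 0;
}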
- */ - return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); -} - static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) { if (boot_cpu_has_bug(X86_BUG_TAA)) @@ -135,7 +117,7 @@ static void tsx_clear_cpuid(void) rdmsrl(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; wrmsrl(MSR_TSX_FORCE_ABORT, msr); - } else if (tsx_ctrl_is_supported()) { + } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { rdmsrl(MSR_IA32_TSX_CTRL, msr); msr |= TSX_CTRL_CPUID_CLEAR; wrmsrl(MSR_IA32_TSX_CTRL, msr); @@ -158,7 +140,8 @@ static void tsx_dev_mode_disable(void) u64 mcu_opt_ctrl; /* Check if RTM_ALLOW exists */ - if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() || + if (!boot_cpu_has_bug(X86_BUG_TAA) || + !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) || !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) return; @@ -191,7 +174,20 @@ void __init tsx_init(void) return; } - if (!tsx_ctrl_is_supported()) { + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) { + setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL); + } else { tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED; return; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 6f7b8cc1bc9f..621ba9c0f17a 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -139,7 +139,7 @@ static int cpuid_device_destroy(unsigned int cpu) return 0; } -static char *cpuid_devnode(struct device *dev, umode_t *mode) +static char *cpuid_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index e75bc2f217ff..32d710f7eb84 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -57,7 +57,7 @@ ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 5cd51f25f446..28da5dd83fc0 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE]; int __initdata of_ioapic; -void __init early_init_dt_add_memory_arch(u64 base, u64 size) -{ - BUG(); -} - void __init add_dtb(u64 data) { initial_dtb = data + offsetof(struct setup_data, data); @@ -167,7 +162,14 @@ static void __init dtb_lapic_setup(void) return; } smp_found_config = 1; - pic_mode = 1; + if (of_property_read_bool(dn, "intel,virtual-wire-mode")) { + pr_info("Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } else { + pr_info("IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } + register_lapic_address(lapic_addr); } @@ -248,7 +250,7 @@ static void __init dtb_add_ioapic(struct device_node *dn) ret = of_address_to_resource(dn, 0, &r); if (ret) { - printk(KERN_ERR "Can't obtain address 
from device node %pOF.\n", dn); + pr_err("Can't obtain address from device node %pOF.\n", dn); return; } mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); @@ -265,7 +267,7 @@ static void __init dtb_ioapic_setup(void) of_ioapic = 1; return; } - printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); + pr_err("Error: No information about IO-APIC in OF.\n"); } #else static void __init dtb_ioapic_setup(void) {} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index afae4dd77495..0bf6779187dd 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -128,7 +128,7 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) /* No access to the user space stack of other tasks. Ignore. */ break; default: - printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", + printk("%sCode: Unable to access opcode bytes at 0x%lx.\n", loglvl, prologue); break; } @@ -177,6 +177,12 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } } +/* + * This function reads pointers from the stack and dereferences them. The + * pointers may not have their KMSAN shadow set up properly, which may result + * in false positive reports. Disable instrumentation to avoid those. + */ +__no_kmsan_checks static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const char *log_lvl) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 722fd712e1cf..b4905d5173fd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6c5defd6569a..f05339fee778 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 68b38925a74f..44f937015e1e 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -264,11 +264,11 @@ static __init void early_pci_serial_init(char *s) bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); /* - * Verify it is a UART type device + * Verify it is a 16550-UART type device */ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || - (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ { + (((classcode >> 8) & 0xff) != 
PCI_SERIAL_16550_COMPATIBLE)) { if (!force) return; } @@ -276,22 +276,22 @@ static __init void early_pci_serial_init(char *s) /* * Determine if it is IO or memory mapped */ - if (bar0 & 0x01) { + if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { /* it is IO mapped */ serial_in = io_serial_in; serial_out = io_serial_out; - early_serial_base = bar0&0xfffffffc; + early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK; write_pci_config(bus, slot, func, PCI_COMMAND, - cmdreg|PCI_COMMAND_IO); + cmdreg|PCI_COMMAND_IO); } else { /* It is memory mapped - assume 32-bit alignment */ serial_in = mem32_serial_in; serial_out = mem32_serial_out; /* WARNING! assuming the address is always in the first 4G */ early_serial_base = - (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); write_pci_config(bus, slot, func, PCI_COMMAND, - cmdreg|PCI_COMMAND_MEMORY); + cmdreg|PCI_COMMAND_MEMORY); } /* diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 9417d5aa7305..16f9814c9be0 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -94,17 +94,7 @@ static inline unsigned long espfix_base_addr(unsigned int cpu) static void init_espfix_random(void) { - unsigned long rand; - - /* - * This is run before the entropy pools are initialized, - * but this is hopefully better than nothing. - */ - if (!arch_get_random_longs(&rand, 1)) { - /* The constant is an arbitrary large prime */ - rand = rdtsc(); - rand *= 0xc345c6b72fd16123UL; - } + unsigned long rand = get_random_long(); slot_random = rand % ESPFIX_STACKS_PER_PAGE; page_random = (rand / ESPFIX_STACKS_PER_PAGE) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3b28c5b25e12..9baa89a8877d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -391,8 +391,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; - struct pkru_state *xpkru; - int ret; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) @@ -406,16 +404,15 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; - ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); - if (ret) - return ret; + /* + * Nullify @vpkru to preserve its current value if PKRU's bit isn't set + * in the header. KVM's odd ABI is to leave PKRU untouched in this + * case (all other components are eventually re-initialized). + */ + if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) + vpkru = NULL; - /* Retrieve PKRU if not in init state */ - if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { - xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); - *vpkru = xpkru->pkru; - } - return 0; + return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ @@ -605,9 +602,9 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); + fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); - fpregs_unlock(); /* * Children never inherit PASID state. 
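The early_pci_serial_init() hunk above replaces the open-coded 0x01/0xfffffffc/0xfffffff0 masks with the standard PCI_BASE_ADDRESS_* definitions when deciding whether BAR0 is I/O- or memory-mapped. Decoding a BAR this way is ordinary bit arithmetic; the sketch below keeps local copies of the mask values (mirroring <linux/pci_regs.h>) and decodes two made-up BAR0 values:

#include <stdint.h>
#include <stdio.h>

/* local copies of the <linux/pci_regs.h> values used by the hunk */
#define PCI_BASE_ADDRESS_SPACE		0x01	/* 0 = memory, 1 = I/O */
#define PCI_BASE_ADDRESS_SPACE_IO	0x01
#define PCI_BASE_ADDRESS_IO_MASK	(~0x03UL)
#define PCI_BASE_ADDRESS_MEM_MASK	(~0x0fUL)

static void decode_bar0(uint32_t bar0)
{
	if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO)
		printf("BAR0 0x%08x: I/O ports at 0x%lx\n",
		       bar0, bar0 & PCI_BASE_ADDRESS_IO_MASK);
	else
		printf("BAR0 0x%08x: MMIO at 0x%lx\n",
		       bar0, bar0 & PCI_BASE_ADDRESS_MEM_MASK);
}

int main(void)
{
	decode_bar0(0x0000c001);	/* hypothetical I/O-mapped UART          */
	decode_bar0(0xfe000000);	/* hypothetical 32-bit memory-mapped BAR */
	return 0;
}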
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 621f4b6cac4a..851eb13edc01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -133,9 +133,6 @@ static void __init fpu__init_system_generic(void) fpu__init_system_mxcsr(); } -/* Get alignment of the TYPE. */ -#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) - /* * Enforce that 'MEMBER' is the last field of 'TYPE'. * @@ -143,8 +140,8 @@ static void __init fpu__init_system_generic(void) * because that's how C aligns structs. */ #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ - BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \ - TYPE_ALIGN(TYPE))) + BUILD_BUG_ON(sizeof(TYPE) != \ + ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE))) /* * We append the 'struct fpu' to the task_struct: @@ -210,13 +207,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpstate_reset(¤t->thread.fpu); } -static void __init fpu__init_init_fpstate(void) -{ - /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = fpu_kernel_cfg.max_features; -} - /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: @@ -236,5 +226,4 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 75ffaef8c299..6d056b68f4ed 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -167,7 +167,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru); out: vfree(tmpbuf); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 91d4b6de58ab..558076dbde5b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -396,7 +396,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) + if (copy_sigframe_from_user_to_xstate(tsk, buf_fx)) return false; } else { if (__copy_from_user(&fpregs->fxsave, buf_fx, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c8340156bfd2..714166cc25f2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -360,7 +360,7 @@ static void __init setup_init_fpu_buf(void) print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 @@ -440,8 +440,8 @@ static void __init __xstate_dump_leaves(void) } } -#define XSTATE_WARN_ON(x) do { \ - if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ +#define XSTATE_WARN_ON(x, fmt, ...) 
do { \ + if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) @@ -554,8 +554,7 @@ static bool __init check_xstate_against_struct(int nr) (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { - WARN_ONCE(1, "no structure for xstate: %d\n", nr); - XSTATE_WARN_ON(1); + XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; @@ -598,12 +597,13 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * XSAVES. */ if (!xsaves && xfeature_is_supervisor(i)) { - XSTATE_WARN_ON(1); + XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); - XSTATE_WARN_ON(size != kernel_size); + XSTATE_WARN_ON(size != kernel_size, + "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } @@ -678,20 +678,6 @@ static unsigned int __init get_xsave_size_user(void) return ebx; } -/* - * Will the runtime-enumerated 'xstate_size' fit in the init - * task's statically-allocated buffer? - */ -static bool __init is_supported_xstate_size(unsigned int test_xstate_size) -{ - if (test_xstate_size <= sizeof(init_fpstate.regs)) - return true; - - pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(init_fpstate.regs), test_xstate_size); - return false; -} - static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ @@ -717,10 +703,6 @@ static int __init init_xstate_size(void) kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); - /* Ensure we have the space to store all default enabled features. */ - if (!is_supported_xstate_size(kernel_default_size)) - return -EINVAL; - if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; @@ -875,6 +857,19 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); + /* + * init_fpstate excludes dynamic states as they are large but init + * state is zero. + */ + init_fpstate.size = fpu_kernel_cfg.default_size; + init_fpstate.xfeatures = fpu_kernel_cfg.default_features; + + if (init_fpstate.size > sizeof(init_fpstate.regs)) { + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + sizeof(init_fpstate.regs), init_fpstate.size); + goto out_disable; + } + setup_init_fpu_buf(); /* @@ -1130,6 +1125,15 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = fpstate->user_xfeatures; + /* + * Dynamic features are not present in init_fpstate. When they are + * in an all zeros init state, remove those from 'mask' to zero + * those features in the user buffer instead of retrieving them + * from init_fpstate. 
+ */ + if (fpu_state_size_dynamic()) + mask &= (header.xfeatures | xinit->header.xcomp_bv); + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space @@ -1196,8 +1200,36 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, } +/** + * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate + * @fpstate: The fpstate buffer to copy to + * @kbuf: The UABI format buffer, if it comes from the kernel + * @ubuf: The UABI format buffer, if it comes from userspace + * @pkru: The location to write the PKRU value to + * + * Converts from the UABI format into the kernel internal hardware + * dependent format. + * + * This function ultimately has three different callers with distinct PKRU + * behavior. + * 1. When called from sigreturn the PKRU register will be restored from + * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to + * @fpstate is sufficient to cover this case, but the caller will also + * pass a pointer to the thread_struct's pkru field in @pkru and updating + * it is harmless. + * 2. When called from ptrace the PKRU register will be restored from the + * thread_struct's pkru field. A pointer to that is passed in @pkru. + * The kernel will restore it manually, so the XRSTOR behavior that resets + * the PKRU register to the hardware init value (0) if the corresponding + * xfeatures bit is not set is emulated here. + * 3. When called from KVM the PKRU register will be restored from the vcpu's + * pkru field. A pointer to that is passed in @pkru. KVM hasn't used + * XRSTOR and hasn't had the PKRU resetting behavior described above. To + * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures + * bit is not set. + */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, - const void __user *ubuf) + const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; @@ -1246,6 +1278,20 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, } } + if (hdr.xfeatures & XFEATURE_MASK_PKRU) { + struct pkru_state *xpkru; + + xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); + *pkru = xpkru->pkru; + } else { + /* + * KVM may pass NULL here to indicate that it does not need + * PKRU updated. + */ + if (pkru) + *pkru = 0; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1264,9 +1310,9 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ -int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { - return copy_uabi_to_xstate(fpstate, kbuf, NULL); + return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* @@ -1274,10 +1320,10 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. 
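The kernel-doc above describes three PKRU expectations for copy_uabi_to_xstate(): sigreturn restores PKRU via XRSTOR anyway (writing @pkru is harmless), ptrace wants the XRSTOR reset-to-0 behaviour emulated when the PKRU bit is clear, and KVM passes a NULL pointer to keep PKRU untouched. A tiny model of just that decision, detached from the real xstate code (function name and types are invented for the sketch):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Model of the PKRU hand-off described in the comment above:
 *   - pkru == NULL            -> caller (KVM-style) wants PKRU left alone
 *   - PKRU bit set in header  -> take the value from the user buffer
 *   - PKRU bit clear          -> emulate XRSTOR's reset to the init value 0
 */
static void copy_pkru_model(uint32_t *pkru, int pkru_bit_set, uint32_t buf_pkru)
{
	if (!pkru)
		return;
	*pkru = pkru_bit_set ? buf_pkru : 0;
}

int main(void)
{
	uint32_t pkru = 0xdeadbeef;

	copy_pkru_model(&pkru, 1, 0x55555554);	/* bit set: value copied      */
	assert(pkru == 0x55555554);

	copy_pkru_model(&pkru, 0, 0x55555554);	/* ptrace, bit clear: reset   */
	assert(pkru == 0);

	pkru = 0x12345678;
	copy_pkru_model(NULL, 0, 0x55555554);	/* KVM, NULL: left untouched  */
	assert(pkru == 0x12345678);

	puts("PKRU hand-off model matches the documented behaviour");
	return 0;
}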
*/ -int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, +int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { - return copy_uabi_to_xstate(fpstate, NULL, ubuf); + return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 5ad47031383b..a4ecb04d8d64 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -46,8 +46,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); -extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); -extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); +extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf); extern void fpu__init_cpu_xstate(void); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bd165004776d..5e7ead52cfdb 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,10 +24,10 @@ #include <linux/module.h> #include <linux/memory.h> #include <linux/vmalloc.h> +#include <linux/set_memory.h> #include <trace/syscall.h> -#include <asm/set_memory.h> #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> @@ -69,6 +69,10 @@ static const char *ftrace_nop_replace(void) static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) { + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting itself. + */ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); } @@ -217,7 +221,9 @@ void ftrace_replace_code(int enable) ret = ftrace_verify_code(rec->ip, old); if (ret) { + ftrace_expected = old; ftrace_bug(ret, rec); + ftrace_expected = NULL; return; } } @@ -317,7 +323,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long size; unsigned long *ptr; void *trampoline; - void *ip; + void *ip, *dest; /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; @@ -359,7 +365,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE); + __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); @@ -404,20 +410,20 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* put in the call to the function */ mutex_lock(&text_mutex); call_offset -= start_offset; + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting before the call already. 
+ */ + dest = ftrace_ops_get_func(ops); memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, - trampoline + call_offset, - ftrace_ops_get_func(ops)), CALL_INSN_SIZE); + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; - set_vm_flush_reset_perms(trampoline); - - if (likely(system_state != SYSTEM_BOOTING)) - set_memory_ro((unsigned long)trampoline, npages); - set_memory_x((unsigned long)trampoline, npages); + set_memory_rox((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: tramp_free(trampoline); diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index dfeb227de561..1265ad519249 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -3,7 +3,9 @@ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc */ +#include <linux/cfi_types.h> #include <linux/linkage.h> +#include <asm/asm-offsets.h> #include <asm/ptrace.h> #include <asm/ftrace.h> #include <asm/export.h> @@ -129,9 +131,20 @@ .endm +SYM_TYPED_FUNC_START(ftrace_stub) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(ftrace_stub) + +SYM_TYPED_FUNC_START(ftrace_stub_graph) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(ftrace_stub_graph) + #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -140,6 +153,8 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + CALL_DEPTH_ACCOUNT + /* Stack - skipping return address of ftrace_caller */ leaq MCOUNT_REG_SIZE+8(%rsp), %rcx movq %rcx, RSP(%rsp) @@ -155,6 +170,9 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Only ops with REGS flag set should have CS register set */ movq $0, CS(%rsp) + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) ANNOTATE_NOENDBR call ftrace_stub @@ -172,21 +190,10 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) -SYM_FUNC_START(ftrace_epilogue) -/* - * This is weak to keep gas from relaxing the jumps. - */ -SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) - UNWIND_HINT_FUNC - ENDBR - RET -SYM_FUNC_END(ftrace_epilogue) - SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq @@ -195,6 +202,8 @@ SYM_FUNC_START(ftrace_regs_caller) save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Load the ftrace_ops into the 3rd parameter */ @@ -225,6 +234,9 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) /* regs go into 4th parameter */ leaq (%rsp), %rcx + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) ANNOTATE_NOENDBR call ftrace_stub @@ -262,14 +274,11 @@ SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) popfq /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. + * The trampoline will add the return. 
*/ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - jmp ftrace_epilogue + RET /* Swap the flags with orig_rax */ 1: movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -280,7 +289,20 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC - jmp ftrace_epilogue + + /* + * The above left an extra return value on the stack; effectively + * doing a tail-call without using a register. This PUSH;RET + * pattern unbalances the RSB, inject a pointless CALL to rebalance. + */ + ANNOTATE_INTRA_FUNCTION_CALL + CALL .Ldo_rebalance + int3 +.Ldo_rebalance: + add $8, %rsp + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) @@ -289,11 +311,10 @@ STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT + cmpq $ftrace_stub, ftrace_trace_function jnz trace - -SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) - ENDBR RET trace: @@ -345,6 +366,8 @@ SYM_CODE_START(return_to_handler) int3 .Ldo_rop: mov %rdi, (%rsp) - RET + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6a3cfaf6b72a..387e4b12e823 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -203,7 +203,7 @@ unsigned long __head __startup_64(unsigned long physaddr, load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); /* Is the address not 2M aligned? */ - if (load_delta & ~PMD_PAGE_MASK) + if (load_delta & ~PMD_MASK) for (;;); /* Include the SME encryption mask in the fixup value */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9b7acc9c7874..67c8ed99144b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -261,16 +261,6 @@ SYM_FUNC_START(startup_32_smp) addl $__PAGE_OFFSET, %esp /* - * start system 32-bit setup. We need to re-do some of the things done - * in 16-bit mode for the "real" operations. - */ - movl setup_once_ref,%eax - andl %eax,%eax - jz 1f # Did we do this already? - call *%eax -1: - -/* * Check if it is 486 */ movb $4,X86 # at least 486 @@ -331,18 +321,7 @@ SYM_FUNC_END(startup_32_smp) #include "verify_cpu.S" -/* - * setup_once - * - * The setup work we only want to run on the BSP. - * - * Warning: %esi is live across this function. - */ __INIT -setup_once: - andl $0,setup_once_ref /* Once is enough, thanks */ - RET - SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs @@ -458,7 +437,6 @@ SYM_DATA(early_recursion_flag, .long 0) __REFDATA .align 4 SYM_DATA(initial_code, .long i386_start_kernel) -SYM_DATA(setup_once_ref, .long setup_once) #ifdef CONFIG_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d860d437631b..222efd4a09bc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -370,6 +370,7 @@ SYM_CODE_END(secondary_startup_64) * start_secondary() via .Ljump_to_C_code. 
*/ SYM_CODE_START(start_cpu0) + ANNOTATE_NOENDBR UNWIND_HINT_EMPTY movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 668a4a6533d9..bbb0f737aab1 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -266,7 +266,7 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) /* CPU entry erea is always used for CPU entry */ if (within_area(addr, end, CPU_ENTRY_AREA_BASE, - CPU_ENTRY_AREA_TOTAL_SIZE)) + CPU_ENTRY_AREA_MAP_SIZE)) return true; /* diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 15aefa3f3e18..3aa5304200c5 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -407,7 +407,7 @@ struct legacy_pic null_legacy_pic = { .make_irq = legacy_pic_uint_noop, }; -struct legacy_pic default_legacy_pic = { +static struct legacy_pic default_legacy_pic = { .nr_legacy_irqs = NR_IRQS_LEGACY, .chip = &i8259A_chip, .mask = mask_8259A_irq, diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 01833ebf5e8e..dc1049c01f9b 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -52,9 +52,6 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -77,7 +74,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. 
However, if we are @@ -115,7 +112,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) int node = cpu_to_node(cpu); struct page *ph, *ps; - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); @@ -127,8 +124,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return -ENOMEM; } - per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); - per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); return 0; } @@ -138,7 +135,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1c0fb96b9e39..fe0c859873d1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -50,7 +50,7 @@ static int map_irq_stack(unsigned int cpu) return -ENOMEM; /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -63,14 +63,14 @@ static int map_irq_stack(unsigned int cpu) void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif int irq_init_percpu_irqstack(unsigned int cpu) { - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4c3c27b6aea3..66299682b6b7 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -43,6 +43,7 @@ #include <linux/objtool.h> #include <linux/vmalloc.h> #include <linux/pgtable.h> +#include <linux/set_memory.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -51,7 +52,6 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> -#include <asm/set_memory.h> #include <asm/ibt.h> #include "common.h" @@ -59,8 +59,6 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)regs->sp) - #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ @@ -416,18 +414,11 @@ void *alloc_insn_page(void) if (!page) return NULL; - set_vm_flush_reset_perms(page); - /* - * First make the page read-only, and only then make it executable to - * prevent it from being W+X in between. - */ - set_memory_ro((unsigned long)page, 1); - /* * TODO: Once additional kernel code protection mechanisms are set, ensure * that the page was not maliciously altered and it is still zeroed. 
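Both the ftrace trampoline hunk earlier and alloc_insn_page() here replace the separate set_memory_ro()/set_memory_x() steps with a single set_memory_rox() call; either way the page ends up read-only and executable, never writable and executable at the same time. A userspace analogue of that W^X discipline on x86-64 Linux (mmap the page writable, emit a single RET, then mprotect to read+exec before calling it; illustrative only, not the kernel API):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);

	/* start writable but not executable ... */
	unsigned char *page = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (page == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	page[0] = 0xc3;		/* x86 RET: a function that just returns */

	/* ... then drop write and add exec in one step, like set_memory_rox() */
	if (mprotect(page, pagesz, PROT_READ | PROT_EXEC)) {
		perror("mprotect");
		return 1;
	}

	((void (*)(void))page)();	/* call the single-RET "function" */
	puts("executed an RX page that was never writable and executable at once");

	munmap(page, pagesz);
	return 0;
}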
*/ - set_memory_x((unsigned long)page, 1); + set_memory_rox((unsigned long)page, 1); return page; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d4e48b4a438b..1cceac5984da 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -349,7 +349,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + u64 pa; WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); @@ -798,19 +798,13 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and * restoring to/from the stack. */ -asm( -".pushsection .text;" -".global __raw_callee_save___kvm_vcpu_is_preempted;" -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" -"__raw_callee_save___kvm_vcpu_is_preempted:" -ASM_ENDBR -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" -"setne %al;" -ASM_RET -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" -".popsection"); +#define PV_VCPU_PREEMPTED_ASM \ + "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \ + "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ + "setne %al\n\t" +DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, + PV_VCPU_PREEMPTED_ASM, .text); #endif static void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b1abf663417c..705fb2a41d7d 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -53,7 +53,7 @@ static unsigned long int get_module_load_offset(void) */ if (module_load_offset == 0) module_load_offset = - (get_random_int() % 1024 + 1) * PAGE_SIZE; + get_random_u32_inclusive(1, 1024) * PAGE_SIZE; mutex_unlock(&module_kaslr_mutex); } return module_load_offset; @@ -74,10 +74,11 @@ void *module_alloc(unsigned long size) return NULL; p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, - __builtin_return_address(0)); + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; @@ -251,14 +252,13 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, - *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, + *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) @@ -273,6 +273,10 @@ int module_finalize(const Elf_Ehdr *hdr, retpolines = s; if (!strcmp(".return_sites", secstrings + s->sh_name)) returns = s; + if (!strcmp(".call_sites", secstrings + s->sh_name)) + calls = s; + if (!strcmp(".cfi_sites", secstrings + s->sh_name)) + cfi = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) 
ibt_endbr = s; } @@ -285,6 +289,22 @@ int module_finalize(const Elf_Ehdr *hdr, void *pseg = (void *)para->sh_addr; apply_paravirt(pseg, pseg + para->sh_size); } + if (retpolines || cfi) { + void *rseg = NULL, *cseg = NULL; + unsigned int rsize = 0, csize = 0; + + if (retpolines) { + rseg = (void *)retpolines->sh_addr; + rsize = retpolines->sh_size; + } + + if (cfi) { + cseg = (void *)cfi->sh_addr; + csize = cfi->sh_size; + } + + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); @@ -298,16 +318,32 @@ int module_finalize(const Elf_Ehdr *hdr, void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } + if (calls || para) { + struct callthunk_sites cs = {}; + + if (calls) { + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; + } + + if (para) { + cs.pv_start = (void *)para->sh_addr; + cs.pv_end = (void *)para->sh_addr + para->sh_size; + } + + callthunks_patch_module_calls(&cs, me); + } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); } - if (locks && text) { + if (locks) { void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; + void *text = me->core_layout.base; + void *text_end = text + me->core_layout.text_size; alternatives_smp_module_add(me, me->name, lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); + text, text_end); } if (orc && orc_ip) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ed8ac6bcbafb..708751311786 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -250,7 +250,7 @@ static int msr_device_destroy(unsigned int cpu) return 0; } -static char *msr_devnode(struct device *dev, umode_t *mode) +static char *msr_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 7ca2d46c08cc..327757afb027 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -37,27 +37,10 @@ * nop stub, which must not clobber anything *including the stack* to * avoid confusing the entry prologues. */ -extern void _paravirt_nop(void); -asm (".pushsection .entry.text, \"ax\"\n" - ".global _paravirt_nop\n" - "_paravirt_nop:\n\t" - ASM_ENDBR - ASM_RET - ".size _paravirt_nop, . - _paravirt_nop\n\t" - ".type _paravirt_nop, @function\n\t" - ".popsection"); +DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text); /* stub always returning 0. */ -asm (".pushsection .entry.text, \"ax\"\n" - ".global paravirt_ret0\n" - "paravirt_ret0:\n\t" - ASM_ENDBR - "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" - ASM_RET - ".size paravirt_ret0, . 
- paravirt_ret0\n\t" - ".type paravirt_ret0, @function\n\t" - ".popsection"); - +DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58a6ea472db9..40d156a31676 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -600,7 +600,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } if (updmsr) - write_spec_ctrl_current(msr, false); + update_spec_ctrl_cond(msr); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) @@ -965,7 +965,7 @@ early_param("idle", idle_setup); unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; + sp -= get_random_u32_below(8192); return sp & ~0xf; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2f314b170c9f..470c128759ea 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -191,13 +191,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0 and cpu_current_top_of_stack. This changes + * Reload esp0 and pcpu_hot.top_of_stack. This changes * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ update_task_stack(next_p); refresh_sysenter_cs(next); - this_cpu_write(cpu_current_top_of_stack, + this_cpu_write(pcpu_hot.top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); @@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - this_cpu_write(current_task, next_p); + raw_cpu_write(pcpu_hot.current_task, next_p); switch_fpu_finish(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1962008fe743..4e34b3b68ebd 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -165,7 +165,7 @@ static noinstr unsigned long __rdgsbase_inactive(void) lockdep_assert_irqs_disabled(); - if (!static_cpu_has(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); @@ -190,7 +190,7 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); - if (!static_cpu_has(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); @@ -553,6 +553,7 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ +__no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { @@ -562,7 +563,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(hardirq_stack_inuse)); + this_cpu_read(pcpu_hot.hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); @@ -616,8 +617,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. 
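The arch_align_stack() hunk above swaps get_random_int() % 8192 for get_random_u32_below(8192); either way the effect is a random downward shift of less than 8 KiB followed by rounding to 16 bytes. A quick userspace check of that invariant (rand() stands in for the kernel RNG helper, the base address is made up):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* same shape as arch_align_stack(): random downward shift, then 16-byte align */
static uintptr_t align_stack(uintptr_t sp, unsigned int rnd_below_8192)
{
	sp -= rnd_below_8192;
	return sp & ~(uintptr_t)0xf;
}

int main(void)
{
	uintptr_t base = 0x7ffdf000u;	/* made-up stack top */

	for (int i = 0; i < 1000; i++) {
		uintptr_t sp = align_stack(base, rand() % 8192);

		assert((sp & 0xf) == 0);	/* always 16-byte aligned         */
		assert(base - sp < 8192 + 16);	/* shift bounded by 8 KiB + slack */
	}
	puts("randomized stack tops stay aligned and bounded");
	return 0;
}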
*/ - this_cpu_write(current_task, next_p); - this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + raw_cpu_write(pcpu_hot.current_task, next_p); + raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 37c12fb92906..dfaa270a7cc9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -44,16 +44,35 @@ #include "tls.h" -enum x86_regset { - REGSET_GENERAL, - REGSET_FP, - REGSET_XFP, - REGSET_IOPERM64 = REGSET_XFP, - REGSET_XSTATE, - REGSET_TLS, - REGSET_IOPERM32, +enum x86_regset_32 { + REGSET32_GENERAL, + REGSET32_FP, + REGSET32_XFP, + REGSET32_XSTATE, + REGSET32_TLS, + REGSET32_IOPERM, }; +enum x86_regset_64 { + REGSET64_GENERAL, + REGSET64_FP, + REGSET64_IOPERM, + REGSET64_XSTATE, +}; + +#define REGSET_GENERAL \ +({ \ + BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \ + REGSET32_GENERAL; \ +}) + +#define REGSET_FP \ +({ \ + BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \ + REGSET32_FP; \ +}) + + struct pt_regs_offset { const char *name; int offset; @@ -788,13 +807,13 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_X86_32 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, + REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, + REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; #endif @@ -1086,13 +1105,13 @@ static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, 0, + REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_SETFPXREGS: /* Set the child extended FPU state. 
*/ return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, 0, + REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); @@ -1215,29 +1234,38 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 static struct user_regset x86_64_regsets[] __ro_after_init = { - [REGSET_GENERAL] = { - .core_note_type = NT_PRSTATUS, - .n = sizeof(struct user_regs_struct) / sizeof(long), - .size = sizeof(long), .align = sizeof(long), - .regset_get = genregs_get, .set = genregs_set + [REGSET64_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .regset_get = genregs_get, + .set = genregs_set }, - [REGSET_FP] = { - .core_note_type = NT_PRFPREG, - .n = sizeof(struct fxregs_state) / sizeof(long), - .size = sizeof(long), .align = sizeof(long), - .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set + [REGSET64_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct fxregs_state) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .active = regset_xregset_fpregs_active, + .regset_get = xfpregs_get, + .set = xfpregs_set }, - [REGSET_XSTATE] = { - .core_note_type = NT_X86_XSTATE, - .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .regset_get = xstateregs_get, - .set = xstateregs_set + [REGSET64_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), + .align = sizeof(u64), + .active = xstateregs_active, + .regset_get = xstateregs_get, + .set = xstateregs_set }, - [REGSET_IOPERM64] = { - .core_note_type = NT_386_IOPERM, - .n = IO_BITMAP_LONGS, - .size = sizeof(long), .align = sizeof(long), - .active = ioperm_active, .regset_get = ioperm_get + [REGSET64_IOPERM] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), + .align = sizeof(long), + .active = ioperm_active, + .regset_get = ioperm_get }, }; @@ -1256,43 +1284,57 @@ static const struct user_regset_view user_x86_64_view = { #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static struct user_regset x86_32_regsets[] __ro_after_init = { - [REGSET_GENERAL] = { - .core_note_type = NT_PRSTATUS, - .n = sizeof(struct user_regs_struct32) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .regset_get = genregs32_get, .set = genregs32_set + [REGSET32_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .regset_get = genregs32_get, + .set = genregs32_set }, - [REGSET_FP] = { - .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set + [REGSET32_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = regset_fpregs_active, + .regset_get = fpregs_get, + .set = fpregs_set }, - [REGSET_XFP] = { - .core_note_type = NT_PRXFPREG, - .n = sizeof(struct fxregs_state) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set + [REGSET32_XFP] = { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct fxregs_state) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = regset_xregset_fpregs_active, + .regset_get = xfpregs_get, + .set = 
xfpregs_set }, - [REGSET_XSTATE] = { - .core_note_type = NT_X86_XSTATE, - .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .regset_get = xstateregs_get, - .set = xstateregs_set + [REGSET32_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), + .align = sizeof(u64), + .active = xstateregs_active, + .regset_get = xstateregs_get, + .set = xstateregs_set }, - [REGSET_TLS] = { - .core_note_type = NT_386_TLS, - .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, - .size = sizeof(struct user_desc), - .align = sizeof(struct user_desc), - .active = regset_tls_active, - .regset_get = regset_tls_get, .set = regset_tls_set + [REGSET32_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, + .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .regset_get = regset_tls_get, + .set = regset_tls_set }, - [REGSET_IOPERM32] = { - .core_note_type = NT_386_IOPERM, - .n = IO_BITMAP_BYTES / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = ioperm_active, .regset_get = ioperm_get + [REGSET32_IOPERM] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = ioperm_active, + .regset_get = ioperm_get }, }; @@ -1311,10 +1353,10 @@ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 - x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); + x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64); #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION - x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); + x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64); #endif xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4809c0dc4eb0..4a73351f87f8 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -41,6 +41,7 @@ .text .align PAGE_SIZE .code64 +SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR @@ -312,5 +313,5 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) int3 SYM_CODE_END(swap_pages) - .globl kexec_control_code_size -.set kexec_control_code_size, . - relocate_kernel + .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc +SYM_CODE_END(relocate_range); diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index bba1abd05bfe..79bc8a97a083 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -42,8 +42,16 @@ static void remove_e820_regions(struct resource *avail) resource_clip(avail, e820_start, e820_end); if (orig.start != avail->start || orig.end != avail->end) { - pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", - &orig, avail, e820_start, e820_end); + pr_info("resource: avoiding allocation from e820 entry [mem %#010Lx-%#010Lx]\n", + e820_start, e820_end); + if (avail->end > avail->start) + /* + * Use %pa instead of %pR because "avail" + * is typically IORESOURCE_UNSET, so %pR + * shows the size instead of addresses. 
+ */ + pr_info("resource: remaining [mem %pa-%pa] available\n", + &avail->start, &avail->end); orig = *avail; } } diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 586f718b8e95..349046434513 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -4,11 +4,8 @@ */ #include <linux/platform_device.h> #include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/bcd.h> #include <linux/export.h> #include <linux/pnp.h> -#include <linux/of.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> @@ -20,26 +17,23 @@ /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details. */ volatile unsigned long cmos_lock; EXPORT_SYMBOL(cmos_lock); #endif /* CONFIG_X86_32 */ -/* For two digit years assume time is always after that */ -#define CMOS_YEARS_OFFS 2000 - DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); /* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * In order to set the CMOS clock precisely, mach_set_cmos_time has to be * called 500 ms after the second nowtime has started, because when * nowtime is written into the registers of the CMOS clock, it will * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. */ -int mach_set_rtc_mmss(const struct timespec64 *now) +int mach_set_cmos_time(const struct timespec64 *now) { unsigned long long nowtime = now->tv_sec; struct rtc_time tm; @@ -62,8 +56,7 @@ int mach_set_rtc_mmss(const struct timespec64 *now) void mach_get_cmos_time(struct timespec64 *now) { - unsigned int status, year, mon, day, hour, min, sec, century = 0; - unsigned long flags; + struct rtc_time tm; /* * If pm_trace abused the RTC as storage, set the timespec to 0, @@ -74,51 +67,13 @@ void mach_get_cmos_time(struct timespec64 *now) return; } - spin_lock_irqsave(&rtc_lock, flags); - - /* - * If UIP is clear, then we have >= 244 microseconds before - * RTC registers will be updated. Spec sheet says that this - * is the reliable way to read RTC - registers. If UIP is set - * then the register access might be invalid. 
- */ - while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - cpu_relax(); - - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); -#endif - - status = CMOS_READ(RTC_CONTROL); - WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); - - spin_unlock_irqrestore(&rtc_lock, flags); - - if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { - sec = bcd2bin(sec); - min = bcd2bin(min); - hour = bcd2bin(hour); - day = bcd2bin(day); - mon = bcd2bin(mon); - year = bcd2bin(year); + if (mc146818_get_time(&tm)) { + pr_err("Unable to read current time from RTC\n"); + now->tv_sec = now->tv_nsec = 0; + return; } - if (century) { - century = bcd2bin(century); - year += century * 100; - } else - year += CMOS_YEARS_OFFS; - - now->tv_sec = mktime64(year, mon, day, hour, min, sec); + now->tv_sec = rtc_tm_to_time64(&tm); now->tv_nsec = 0; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 216fee7144ee..88188549647c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -31,9 +31,11 @@ #include <xen/xen.h> #include <asm/apic.h> +#include <asm/efi.h> #include <asm/numa.h> #include <asm/bios_ebda.h> #include <asm/bugs.h> +#include <asm/cacheinfo.h> #include <asm/cpu.h> #include <asm/efi.h> #include <asm/gart.h> @@ -1074,24 +1076,13 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ - if (IS_ENABLED(CONFIG_MTRR)) - mtrr_bp_init(); - else - pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel."); - + cache_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820__end_of_ram_pfn(); max_possible_pfn = max_pfn; /* - * This call is required when the CPU does not support PAT. If - * mtrr_bp_init() invoked it already via pat_init() the call has no - * effect. - */ - init_cache_modes(); - - /* * Define random base addresses for memory sections after max_pfn is * defined and before each memory section base is used. */ @@ -1175,7 +1166,7 @@ void __init setup_arch(char **cmdline_p) * Moreover, on machines with SandyBridge graphics or in setups that use * crashkernel the entire 1M is reserved anyway. 
*/ - reserve_real_mode(); + x86_platform.realmode_reserve(); init_mem_mapping(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 49325caa7307..c242dc47e9cb 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -11,6 +11,7 @@ #include <linux/smp.h> #include <linux/topology.h> #include <linux/pfn.h> +#include <linux/stackprotector.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/desc.h> @@ -21,10 +22,6 @@ #include <asm/proto.h> #include <asm/cpumask.h> #include <asm/cpu.h> -#include <asm/stackprotector.h> - -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); #ifdef CONFIG_X86_64 #define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) @@ -172,7 +169,7 @@ void __init setup_per_cpu_areas(void) for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(cpu_number, cpu) = cpu; + per_cpu(pcpu_hot.cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* * Copy data used in early init routines from the @@ -211,7 +208,7 @@ void __init setup_per_cpu_areas(void) * area. Reload any changed state for the boot CPU. */ if (!cpu) - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); } /* indicate the early static arrays will soon be gone */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9c7265b524c7..1504eb8d25aa 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -37,180 +37,27 @@ #include <asm/sighandling.h> #include <asm/vm86.h> -#ifdef CONFIG_X86_64 -#include <linux/compat.h> -#include <asm/proto.h> -#include <asm/ia32_unistd.h> -#include <asm/fpu/xstate.h> -#endif /* CONFIG_X86_64 */ - #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> -#ifdef CONFIG_X86_64 -/* - * If regs->ss will cause an IRET fault, change it. Otherwise leave it - * alone. Using this generally makes no sense unless - * user_64bit_mode(regs) would return true. - */ -static void force_valid_ss(struct pt_regs *regs) +static inline int is_ia32_compat_frame(struct ksignal *ksig) { - u32 ar; - asm volatile ("lar %[old_ss], %[ar]\n\t" - "jz 1f\n\t" /* If invalid: */ - "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ - "1:" - : [ar] "=r" (ar) - : [old_ss] "rm" ((u16)regs->ss)); - - /* - * For a valid 64-bit user context, we need DPL 3, type - * read-write data or read-write exp-down data, and S and P - * set. We can't use VERW because VERW doesn't check the - * P bit. 
- */ - ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; - if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && - ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) - regs->ss = __USER_DS; + return IS_ENABLED(CONFIG_IA32_EMULATION) && + ksig->ka.sa.sa_flags & SA_IA32_ABI; } -# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) -#else -# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) -#endif -static bool restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *usc, - unsigned long uc_flags) +static inline int is_ia32_frame(struct ksignal *ksig) { - struct sigcontext sc; - - /* Always make any pending restarted system calls return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) - return false; - -#ifdef CONFIG_X86_32 - loadsegment(gs, sc.gs); - regs->fs = sc.fs; - regs->es = sc.es; - regs->ds = sc.ds; -#endif /* CONFIG_X86_32 */ - - regs->bx = sc.bx; - regs->cx = sc.cx; - regs->dx = sc.dx; - regs->si = sc.si; - regs->di = sc.di; - regs->bp = sc.bp; - regs->ax = sc.ax; - regs->sp = sc.sp; - regs->ip = sc.ip; - -#ifdef CONFIG_X86_64 - regs->r8 = sc.r8; - regs->r9 = sc.r9; - regs->r10 = sc.r10; - regs->r11 = sc.r11; - regs->r12 = sc.r12; - regs->r13 = sc.r13; - regs->r14 = sc.r14; - regs->r15 = sc.r15; -#endif /* CONFIG_X86_64 */ - - /* Get CS/SS and force CPL3 */ - regs->cs = sc.cs | 0x03; - regs->ss = sc.ss | 0x03; - - regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); - /* disable syscall checks */ - regs->orig_ax = -1; - -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - - return fpu__restore_sig((void __user *)sc.fpstate, - IS_ENABLED(CONFIG_X86_32)); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } -static __always_inline int -__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) +static inline int is_x32_frame(struct ksignal *ksig) { -#ifdef CONFIG_X86_32 - unsigned int gs; - savesegment(gs, gs); - - unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); - unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); - unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); - unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); -#endif /* CONFIG_X86_32 */ - - unsafe_put_user(regs->di, &sc->di, Efault); - unsafe_put_user(regs->si, &sc->si, Efault); - unsafe_put_user(regs->bp, &sc->bp, Efault); - unsafe_put_user(regs->sp, &sc->sp, Efault); - unsafe_put_user(regs->bx, &sc->bx, Efault); - unsafe_put_user(regs->dx, &sc->dx, Efault); - unsafe_put_user(regs->cx, &sc->cx, Efault); - unsafe_put_user(regs->ax, &sc->ax, Efault); -#ifdef CONFIG_X86_64 - unsafe_put_user(regs->r8, &sc->r8, Efault); - unsafe_put_user(regs->r9, &sc->r9, Efault); - unsafe_put_user(regs->r10, &sc->r10, Efault); - unsafe_put_user(regs->r11, &sc->r11, Efault); - unsafe_put_user(regs->r12, &sc->r12, Efault); - unsafe_put_user(regs->r13, &sc->r13, Efault); - unsafe_put_user(regs->r14, &sc->r14, Efault); - unsafe_put_user(regs->r15, &sc->r15, Efault); -#endif /* CONFIG_X86_64 */ - - unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); - unsafe_put_user(current->thread.error_code, &sc->err, Efault); - unsafe_put_user(regs->ip, &sc->ip, Efault); -#ifdef CONFIG_X86_32 - unsafe_put_user(regs->cs, (unsigned int __user 
*)&sc->cs, Efault); - unsafe_put_user(regs->flags, &sc->flags, Efault); - unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); - unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); -#else /* !CONFIG_X86_32 */ - unsafe_put_user(regs->flags, &sc->flags, Efault); - unsafe_put_user(regs->cs, &sc->cs, Efault); - unsafe_put_user(0, &sc->gs, Efault); - unsafe_put_user(0, &sc->fs, Efault); - unsafe_put_user(regs->ss, &sc->ss, Efault); -#endif /* CONFIG_X86_32 */ - - unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); - - /* non-iBCS2 extensions.. */ - unsafe_put_user(mask, &sc->oldmask, Efault); - unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); - return 0; -Efault: - return -EFAULT; + return IS_ENABLED(CONFIG_X86_X32_ABI) && + ksig->ka.sa.sa_flags & SA_X32_ABI; } -#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ -do { \ - if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ - goto label; \ -} while(0); - -#define unsafe_put_sigmask(set, frame, label) \ - unsafe_put_user(*(__u64 *)(set), \ - (__u64 __user *)&(frame)->uc.uc_sigmask, \ - label) - /* * Set up a signal frame. */ @@ -223,24 +70,12 @@ do { \ /* * Determine which stack to use.. */ -static unsigned long align_sigframe(unsigned long sp) -{ -#ifdef CONFIG_X86_32 - /* - * Align the stack pointer according to the i386 ABI, - * i.e. so that on function entry ((sp + 4) & 15) == 0. - */ - sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; -#else /* !CONFIG_X86_32 */ - sp = round_down(sp, FRAME_ALIGNMENT) - 8; -#endif - return sp; -} - -static void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, +void __user * +get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { + struct k_sigaction *ka = &ksig->ka; + int ia32_frame = is_ia32_frame(ksig); /* Default to using normal stack */ bool nested_altstack = on_sig_stack(regs->sp); bool entering_altstack = false; @@ -249,7 +84,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long buf_fx = 0; /* redzone */ - if (IS_ENABLED(CONFIG_X86_64)) + if (!ia32_frame) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ @@ -263,7 +98,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; entering_altstack = true; } - } else if (IS_ENABLED(CONFIG_X86_32) && + } else if (ia32_frame && !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && @@ -273,11 +108,19 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, entering_altstack = true; } - sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), - &buf_fx, &math_size); + sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size); *fpstate = (void __user *)sp; - sp = align_sigframe(sp - frame_size); + sp -= frame_size; + + if (ia32_frame) + /* + * Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. + */ + sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; + else + sp = round_down(sp, FRAME_ALIGNMENT) - 8; /* * If we are on the alternate signal stack and would overflow it, don't. 
@@ -300,391 +143,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)sp; } -#ifdef CONFIG_X86_32 -static const struct { - u16 poplmovl; - u32 val; - u16 int80; -} __attribute__((packed)) retcode = { - 0xb858, /* popl %eax; movl $..., %eax */ - __NR_sigreturn, - 0x80cd, /* int $0x80 */ -}; - -static const struct { - u8 movl; - u32 val; - u16 int80; - u8 pad; -} __attribute__((packed)) rt_retcode = { - 0xb8, /* movl $..., %eax */ - __NR_rt_sigreturn, - 0x80cd, /* int $0x80 */ - 0 -}; - -static int -__setup_frame(int sig, struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - struct sigframe __user *frame; - void __user *restorer; - void __user *fp = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - unsafe_put_user(sig, &frame->sig, Efault); - unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); - unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); - if (current->mm->context.vdso) - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_sigreturn; - else - restorer = &frame->retcode; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - - /* Set up to return from userspace. */ - unsafe_put_user(restorer, &frame->pretcode, Efault); - - /* - * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); - user_access_end(); - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ksig->ka.sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = 0; - regs->cx = 0; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; - -Efault: - user_access_end(); - return -EFAULT; -} - -static int __setup_rt_frame(int sig, struct ksignal *ksig, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *restorer; - void __user *fp = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - unsafe_put_user(sig, &frame->sig, Efault); - unsafe_put_user(&frame->info, &frame->pinfo, Efault); - unsafe_put_user(&frame->uc, &frame->puc, Efault); - - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); - else - unsafe_put_user(0, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - - /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - unsafe_put_user(restorer, &frame->pretcode, Efault); - - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. 
- */ - unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ksig->ka.sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = (unsigned long)&frame->info; - regs->cx = (unsigned long)&frame->uc; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; -Efault: - user_access_end(); - return -EFAULT; -} -#else /* !CONFIG_X86_32 */ -static unsigned long frame_uc_flags(struct pt_regs *regs) -{ - unsigned long flags; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) - flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; - else - flags = UC_SIGCONTEXT_SS; - - if (likely(user_64bit_mode(regs))) - flags |= UC_STRICT_RESTORE_SS; - - return flags; -} - -static int __setup_rt_frame(int sig, struct ksignal *ksig, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *fp = NULL; - unsigned long uc_flags; - - /* x86-64 should always use SA_RESTORER. */ - if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) - return -EFAULT; - - frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); - uc_flags = frame_uc_flags(regs); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - /* Create the ucontext. */ - unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - } - - /* Set up registers for signal handler */ - regs->di = sig; - /* In case the signal handler was declared without prototypes */ - regs->ax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. */ - regs->si = (unsigned long)&frame->info; - regs->dx = (unsigned long)&frame->uc; - regs->ip = (unsigned long) ksig->ka.sa.sa_handler; - - regs->sp = (unsigned long)frame; - - /* - * Set up the CS and SS registers to run signal handlers in - * 64-bit mode, even if the handler happens to be interrupting - * 32-bit or 16-bit code. - * - * SS is subtle. In 64-bit mode, we don't need any particular - * SS descriptor, but we do need SS to be valid. It's possible - * that the old SS is entirely bogus -- this can happen if the - * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. We also have a compatibility issue here: DOSEMU - * relies on the contents of the SS register indicating the - * SS value at the time of the signal, even though that code in - * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU - * avoids relying on sigreturn to restore SS; instead it uses - * a trampoline.) So we do our best: if the old SS was valid, - * we keep it. Otherwise we replace it. 
- */ - regs->cs = __USER_CS; - - if (unlikely(regs->ss != __USER_DS)) - force_valid_ss(regs); - - return 0; - -Efault: - user_access_end(); - return -EFAULT; -} -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_X86_X32_ABI -static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -{ - struct compat_siginfo new; - - copy_siginfo_to_external32(&new, from); - if (from->si_signo == SIGCHLD) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } - if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) - return -EFAULT; - return 0; -} - -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -{ - if (in_x32_syscall()) - return x32_copy_siginfo_to_user(to, from); - return __copy_siginfo_to_user32(to, from); -} -#endif /* CONFIG_X86_X32_ABI */ - -static int x32_setup_rt_frame(struct ksignal *ksig, - compat_sigset_t *set, - struct pt_regs *regs) -{ -#ifdef CONFIG_X86_X32_ABI - struct rt_sigframe_x32 __user *frame; - unsigned long uc_flags; - void __user *restorer; - void __user *fp = NULL; - - if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) - return -EFAULT; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - uc_flags = frame_uc_flags(regs); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - /* Create the ucontext. */ - unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - unsafe_put_user(0, &frame->uc.uc__pad0, Efault); - restorer = ksig->ka.sa.sa_restorer; - unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - } - - /* Set up registers for signal handler */ - regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ksig->ka.sa.sa_handler; - - /* We use the x32 calling convention here... */ - regs->di = ksig->sig; - regs->si = (unsigned long) &frame->info; - regs->dx = (unsigned long) &frame->uc; - - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); - - regs->cs = __USER_CS; - regs->ss = __USER_DS; -#endif /* CONFIG_X86_X32_ABI */ - - return 0; -#ifdef CONFIG_X86_X32_ABI -Efault: - user_access_end(); - return -EFAULT; -#endif -} - -/* - * Do a signal return; undo the signal stack. - */ -#ifdef CONFIG_X86_32 -SYSCALL_DEFINE0(sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct sigframe __user *frame; - sigset_t set; - - frame = (struct sigframe __user *)(regs->sp - 8); - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || - __get_user(set.sig[1], &frame->extramask[0])) - goto badframe; - - set_current_blocked(&set); - - /* - * x86_32 has no uc_flags bits relevant to restore_sigcontext. - * Save a few cycles by skipping the __get_user. 
- */ - if (!restore_sigcontext(regs, &frame->sc, 0)) - goto badframe; - return regs->ax; - -badframe: - signal_fault(regs, frame, "sigreturn"); - - return 0; -} -#endif /* CONFIG_X86_32 */ - -SYSCALL_DEFINE0(rt_sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct rt_sigframe __user *frame; - sigset_t set; - unsigned long uc_flags; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) - goto badframe; - if (__get_user(uc_flags, &frame->uc.uc_flags)) - goto badframe; - - set_current_blocked(&set); - - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) - goto badframe; - - if (restore_altstack(&frame->uc.uc_stack)) - goto badframe; - - return regs->ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - /* * There are four different struct types for signal frame: sigframe_ia32, * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case @@ -743,43 +201,22 @@ unsigned long get_sigframe_size(void) return max_frame_size; } -static inline int is_ia32_compat_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_IA32_EMULATION) && - ksig->ka.sa.sa_flags & SA_IA32_ABI; -} - -static inline int is_ia32_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); -} - -static inline int is_x32_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_X86_X32_ABI) && - ksig->ka.sa.sa_flags & SA_X32_ABI; -} - static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { - int usig = ksig->sig; - sigset_t *set = sigmask_to_save(); - compat_sigset_t *cset = (compat_sigset_t *) set; - /* Perform fixup for the pre-signal frame. 
*/ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) - return ia32_setup_rt_frame(usig, ksig, cset, regs); + return ia32_setup_rt_frame(ksig, regs); else - return ia32_setup_frame(usig, ksig, cset, regs); + return ia32_setup_frame(ksig, regs); } else if (is_x32_frame(ksig)) { - return x32_setup_rt_frame(ksig, cset, regs); + return x32_setup_rt_frame(ksig, regs); } else { - return __setup_rt_frame(ksig->sig, ksig, set, regs); + return x64_setup_rt_frame(ksig, regs); } } @@ -969,36 +406,3 @@ bool sigaltstack_size_valid(size_t ss_size) return true; } #endif /* CONFIG_DYNAMIC_SIGFRAME */ - -#ifdef CONFIG_X86_X32_ABI -COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct rt_sigframe_x32 __user *frame; - sigset_t set; - unsigned long uc_flags; - - frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) - goto badframe; - if (__get_user(uc_flags, &frame->uc.uc_flags)) - goto badframe; - - set_current_blocked(&set); - - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) - goto badframe; - - if (compat_restore_altstack(&frame->uc.uc_stack)) - goto badframe; - - return regs->ax; - -badframe: - signal_fault(regs, frame, "x32 rt_sigreturn"); - return 0; -} -#endif diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c new file mode 100644 index 000000000000..2553136cf39b --- /dev/null +++ b/arch/x86/kernel/signal_32.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen + */ + +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/personality.h> +#include <linux/compat.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> +#include <asm/ucontext.h> +#include <linux/uaccess.h> +#include <asm/fpu/signal.h> +#include <asm/ptrace.h> +#include <asm/user32.h> +#include <uapi/asm/sigcontext.h> +#include <asm/proto.h> +#include <asm/vdso.h> +#include <asm/sigframe.h> +#include <asm/sighandling.h> +#include <asm/smap.h> + +#ifdef CONFIG_IA32_EMULATION +#include <asm/ia32_unistd.h> + +static inline void reload_segments(struct sigcontext_32 *sc) +{ + unsigned int cur; + + savesegment(gs, cur); + if ((sc->gs | 0x03) != cur) + load_gs_index(sc->gs | 0x03); + savesegment(fs, cur); + if ((sc->fs | 0x03) != cur) + loadsegment(fs, sc->fs | 0x03); + savesegment(ds, cur); + if ((sc->ds | 0x03) != cur) + loadsegment(ds, sc->ds | 0x03); + savesegment(es, cur); + if ((sc->es | 0x03) != cur) + loadsegment(es, sc->es | 0x03); +} + +#define sigset32_t compat_sigset_t +#define restore_altstack32 compat_restore_altstack +#define unsafe_save_altstack32 unsafe_compat_save_altstack + +#else + +#define sigset32_t sigset_t +#define __NR_ia32_sigreturn __NR_sigreturn +#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn +#define restore_altstack32 restore_altstack +#define unsafe_save_altstack32 unsafe_save_altstack +#define __copy_siginfo_to_user32 copy_siginfo_to_user + +#endif + +/* + * Do a signal return; 
undo the signal stack. + */ +static bool ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_32 __user *usc) +{ + struct sigcontext_32 sc; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) + return false; + + /* Get only the ia32 registers. */ + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; + +#ifdef CONFIG_IA32_EMULATION + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + reload_segments(&sc); +#else + loadsegment(gs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; +#endif + + return fpu__restore_sig(compat_ptr(sc.fpstate), 1); +} + +SYSCALL32_DEFINE0(sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); + sigset_t set; + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) + || __get_user(((__u32 *)&set)[1], &frame->extramask[0])) + goto badframe; + + set_current_blocked(&set); + + if (!ia32_restore_sigcontext(regs, &frame->sc)) + goto badframe; + return regs->ax; + +badframe: + signal_fault(regs, frame, "32bit sigreturn"); + return 0; +} + +SYSCALL32_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_ia32 __user *frame; + sigset_t set; + + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + + set_current_blocked(&set); + + if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) + goto badframe; + + if (restore_altstack32(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "32bit rt sigreturn"); + return 0; +} + +/* + * Set up a signal frame. 
+ */ + +#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) + +static __always_inline int +__unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc, + void __user *fpstate, + struct pt_regs *regs, unsigned int mask) +{ + unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault); +#ifdef CONFIG_IA32_EMULATION + unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault); + unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault); +#else + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); +#endif + + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); + + unsafe_put_user(ptr_to_compat(fpstate), &sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; + +Efault: + return -EFAULT; +} + +#define unsafe_put_sigcontext32(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext32(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0) + +int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset32_t *set = (sigset32_t *) sigmask_to_save(); + struct sigframe_ia32 __user *frame; + void __user *restorer; + void __user *fp = NULL; + + /* copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u16 poplmovl; + u32 val; + u16 int80; + } __attribute__((packed)) code = { + 0xb858, /* popl %eax ; movl $...,%eax */ + __NR_ia32_sigreturn, + 0x80cd, /* int $0x80 */ + }; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); + + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = ksig->ka.sa.sa_restorer; + } else { + /* Return stub is in 32bit vsyscall page */ + if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_sigreturn; + else + restorer = &frame->retcode; + } + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + unsafe_put_user(ksig->sig, &frame->sig, Efault); + unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + /* + * These are actually not used anymore, but left because some + * gdb versions depend on them as a marker. 
+ */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + user_access_end(); + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* Make -mregparm=3 work */ + regs->ax = ksig->sig; + regs->dx = 0; + regs->cx = 0; + +#ifdef CONFIG_IA32_EMULATION + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#else + regs->ds = __USER_DS; + regs->es = __USER_DS; +#endif + + regs->cs = __USER32_CS; + regs->ss = __USER_DS; + + return 0; +Efault: + user_access_end(); + return -EFAULT; +} + +int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset32_t *set = (sigset32_t *) sigmask_to_save(); + struct rt_sigframe_ia32 __user *frame; + void __user *restorer; + void __user *fp = NULL; + + /* unsafe_put_user optimizes that into a single 8 byte store */ + static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; + } __attribute__((packed)) code = { + 0xb8, + __NR_ia32_rt_sigreturn, + 0x80cd, + 0, + }; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + unsafe_put_user(ksig->sig, &frame->sig, Efault); + unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault); + unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault); + + /* Create the ucontext. */ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack32(&frame->uc.uc_stack, regs->sp, Efault); + + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + + /* + * Not actually used anymore, but left because some gdb + * versions need it. 
+ */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault); + user_access_end(); + + if (__copy_siginfo_to_user32(&frame->info, &ksig->info)) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* Make -mregparm=3 work */ + regs->ax = ksig->sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; + +#ifdef CONFIG_IA32_EMULATION + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#else + regs->ds = __USER_DS; + regs->es = __USER_DS; +#endif + + regs->cs = __USER32_CS; + regs->ss = __USER_DS; + + return 0; +Efault: + user_access_end(); + return -EFAULT; +} diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c new file mode 100644 index 000000000000..ff9c55064223 --- /dev/null +++ b/arch/x86/kernel/signal_64.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> + +#include <asm/ucontext.h> +#include <asm/fpu/signal.h> +#include <asm/sighandling.h> + +#include <asm/syscall.h> +#include <asm/sigframe.h> +#include <asm/signal.h> + +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. + */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} + +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) +{ + struct sigcontext sc; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1))) + return false; + + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; + + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. 
+ */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); + + return fpu__restore_sig((void __user *)sc.fpstate, 0); +} + +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); + + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); + + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; +} + +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_XSAVE)) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + +int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset_t *set = sigmask_to_save(); + struct rt_sigframe __user *frame; + void __user *fp = NULL; + unsigned long uc_flags; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. 
*/ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->di = ksig->sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatibility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ + regs->cs = __USER_CS; + + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +/* + * Do a signal return; undo the signal stack. 
+ */ +SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} + +int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save(); + struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; + void __user *restorer; + void __user *fp = NULL; + + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); + + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* We use the x32 calling convention here... 
*/ + regs->di = ksig->sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif /* CONFIG_X86_X32_ABI */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f24227bc3220..55cad72715d9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,8 +56,10 @@ #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> +#include <linux/stackprotector.h> #include <asm/acpi.h> +#include <asm/cacheinfo.h> #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> @@ -1046,7 +1048,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); - per_cpu(current_task, cpu) = idle; + per_cpu(pcpu_hot.current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ @@ -1056,7 +1058,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif @@ -1316,7 +1318,7 @@ static void __init smp_sanity_check(void) nr++; } - nr_cpu_ids = 8; + set_nr_cpu_ids(8); } #endif @@ -1428,8 +1430,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); - set_mtrr_aps_delayed_init(); - smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); @@ -1439,12 +1439,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) void arch_thaw_secondary_cpus_begin(void) { - set_mtrr_aps_delayed_init(); + set_cache_aps_delayed_init(true); } void arch_thaw_secondary_cpus_end(void) { - mtrr_aps_init(); + cache_aps_init(); } /* @@ -1453,7 +1453,11 @@ void arch_thaw_secondary_cpus_end(void) void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - switch_to_new_gdt(me); + + /* SMP handles this from setup_per_cpu_areas() */ + if (!IS_ENABLED(CONFIG_SMP)) + switch_gdt_and_percpu_base(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); cpu_set_state_online(me); @@ -1487,7 +1491,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) nmi_selftest(); impress_friends(); - mtrr_aps_init(); + cache_aps_init(); } static int __initdata setup_possible_cpus = -1; @@ -1569,7 +1573,7 @@ __init void prefill_possible_map(void) possible = i; } - nr_cpu_ids = possible; + set_nr_cpu_ids(possible); pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, 
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index aaaba85d6d7f..2ebc338980bc 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -34,6 +34,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
 
 	switch (type) {
 	case CALL:
+		func = callthunks_translate_call_dest(func);
 		code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
 		if (func == &__static_call_return0) {
 			emulate = code;
@@ -52,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
 
 	case RET:
 		if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
-			code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk);
+			code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
 		else
 			code = &retinsn;
 		break;
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 3bacd935f840..4c1bcb6053fc 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -95,7 +95,7 @@ void __init tboot_probe(void)
 
 static pgd_t *tboot_pg_dir;
 static struct mm_struct tboot_mm = {
-	.mm_rb = RB_ROOT,
+	.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
 	.pgd = swapper_pg_dir,
 	.mm_users = ATOMIC_INIT(2),
 	.mm_count = ATOMIC_INIT(1),
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 8617d1ed9d31..1b83377274b8 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -106,7 +106,7 @@ int arch_register_cpu(int num)
 	 * Xen PV guests don't support CPU0 hotplug at all.
 	 */
 	if (c->x86_vendor != X86_VENDOR_INTEL ||
-	    boot_cpu_has(X86_FEATURE_XENPV))
+	    cpu_feature_enabled(X86_FEATURE_XENPV))
 		cpu0_hotpluggable = 0;
 
 	/*
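The static_call.c change above routes RET sites through text_gen_insn(JMP32_INSN_OPCODE, ...), i.e. a 5-byte E9 jump whose 32-bit displacement is taken from the end of the instruction. A stand-alone sketch of just that encoding step (illustrative, not the kernel helper itself):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define JMP32_INSN_OPCODE	0xe9
#define JMP32_INSN_SIZE		5

/* Emit "jmp rel32": displacement is relative to the end of the insn. */
static void gen_jmp32(uint8_t buf[JMP32_INSN_SIZE],
		      uint64_t insn_addr, uint64_t dest)
{
	int32_t rel = (int32_t)(dest - (insn_addr + JMP32_INSN_SIZE));

	buf[0] = JMP32_INSN_OPCODE;
	memcpy(&buf[1], &rel, sizeof(rel));	/* little-endian immediate */
}

int main(void)
{
	uint8_t buf[JMP32_INSN_SIZE];

	gen_jmp32(buf, 0x1000, 0x2000);
	for (int i = 0; i < JMP32_INSN_SIZE; i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return 0;
}

gen_jmp32(buf, 0x1000, 0x2000) prints "e9 fb 0f 00 00", because 0x2000 - (0x1000 + 5) = 0xffb.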
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d62b2cb85cea..d317dc3d06a3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/context_tracking.h>
 #include <linux/interrupt.h>
 #include <linux/kallsyms.h>
+#include <linux/kmsan.h>
 #include <linux/spinlock.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
@@ -63,16 +64,17 @@
 #include <asm/insn-eval.h>
 #include <asm/vdso.h>
 #include <asm/tdx.h>
+#include <asm/cfi.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
-#include <asm/proto.h>
 #else
 #include <asm/processor-flags.h>
 #include <asm/setup.h>
-#include <asm/proto.h>
 #endif
 
+#include <asm/proto.h>
+
 DECLARE_BITMAP(system_vectors, NR_VECTORS);
 
 static inline void cond_local_irq_enable(struct pt_regs *regs)
@@ -300,6 +302,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
 {
 	bool handled = false;
 
+	/*
+	 * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+	 * is a rare case that uses @regs without passing them to
+	 * irqentry_enter().
+	 */
+	kmsan_unpoison_entry_regs(regs);
 	if (!is_valid_bugaddr(regs->ip))
 		return handled;
 
@@ -313,7 +321,8 @@ static noinstr bool handle_bug(struct pt_regs *regs)
 	 */
 	if (regs->flags & X86_EFLAGS_IF)
 		raw_local_irq_enable();
-	if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+	if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
+	    handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
 		regs->ip += LEN_UD2;
 		handled = true;
 	}
@@ -849,7 +858,7 @@ DEFINE_IDTENTRY_RAW(exc_int3)
  */
 asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
-	struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+	struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1;
 	if (regs != eregs)
 		*regs = *eregs;
 	return regs;
@@ -867,7 +876,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
 	 * trust it and switch to the current kernel stack
 	 */
 	if (ip_within_syscall_gap(regs)) {
-		sp = this_cpu_read(cpu_current_top_of_stack);
+		sp = this_cpu_read(pcpu_hot.top_of_stack);
 		goto sync;
 	}
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cafacb2e58cc..a78e73da4a74 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,7 +51,7 @@ int tsc_clocksource_reliable;
 static u32 art_to_tsc_numerator;
 static u32 art_to_tsc_denominator;
 static u64 art_to_tsc_offset;
-struct clocksource *art_related_clocksource;
+static struct clocksource *art_related_clocksource;
 
 struct cyc2ns {
 	struct cyc2ns_data data[2];	/*  0 + 2*16 = 32 */
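In handle_bug() above, a warning-class ud2 (and now a CFI failure that handle_cfi_failure() reports as a warning) is survived by bumping regs->ip past the 2-byte opcode. The same fixup can be shown from user space with a SIGILL handler; a sketch assuming x86-64 and glibc, not kernel code:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

#define LEN_UD2 2

static void sigill_handler(int sig, siginfo_t *info, void *uc_)
{
	ucontext_t *uc = uc_;

	(void)sig; (void)info;
	/* Skip the faulting ud2, mirroring regs->ip += LEN_UD2. */
	uc->uc_mcontext.gregs[REG_RIP] += LEN_UD2;
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigill_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGILL, &sa, NULL);

	asm volatile("ud2");
	puts("survived the ud2");
	return 0;
}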
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 8e1c50c86e5d..d8ba93778ae3 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -183,6 +183,16 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 }
 #endif
 
+/*
+ * While walking the stack, KMSAN may stomp on stale locals from other
+ * functions that were marked as uninitialized upon function exit, and
+ * now hold the call frame information for the current function (e.g. the frame
+ * pointer). Because KMSAN does not specifically mark call frames as
+ * initialized, false positive reports are possible. To prevent such reports,
+ * we mark the functions scanning the stack (here and below) with
+ * __no_kmsan_checks.
+ */
+__no_kmsan_checks
 static bool update_stack_state(struct unwind_state *state,
 			       unsigned long *next_bp)
 {
@@ -250,6 +260,7 @@ static bool update_stack_state(struct unwind_state *state,
 	return true;
 }
 
+__no_kmsan_checks
 bool unwind_next_frame(struct unwind_state *state)
 {
 	struct pt_regs *regs;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 0ea57da92940..cdf6c6060170 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -136,6 +136,21 @@ static struct orc_entry null_orc_entry = {
 	.type = UNWIND_HINT_TYPE_CALL
 };
 
+#ifdef CONFIG_CALL_THUNKS
+static struct orc_entry *orc_callthunk_find(unsigned long ip)
+{
+	if (!is_callthunk((void *)ip))
+		return NULL;
+
+	return &null_orc_entry;
+}
+#else
+static struct orc_entry *orc_callthunk_find(unsigned long ip)
+{
+	return NULL;
+}
+#endif
+
 /* Fake frame pointer entry -- used as a fallback for generated code */
 static struct orc_entry orc_fp_entry = {
 	.type = UNWIND_HINT_TYPE_CALL,
@@ -189,7 +204,11 @@ static struct orc_entry *orc_find(unsigned long ip)
 	if (orc)
 		return orc;
 
-	return orc_ftrace_find(ip);
+	orc = orc_ftrace_find(ip);
+	if (orc)
+		return orc;
+
+	return orc_callthunk_find(ip);
 }
 
 #ifdef CONFIG_MODULES
@@ -713,7 +732,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
 	/* Otherwise, skip ahead to the user-specified starting frame: */
 	while (!unwind_done(state) &&
 	       (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
-			state->sp < (unsigned long)first_frame))
+			state->sp <= (unsigned long)first_frame))
 		unwind_next_frame(state);
 
 	return;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index b63cf8f7745e..6c07f6daaa22 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -722,8 +722,9 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 	switch (opc1) {
 	case 0xeb:	/* jmp 8 */
 	case 0xe9:	/* jmp 32 */
-	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */
 		break;
+	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */
+		goto setup;
 
 	case 0xe8:	/* call relative */
 		branch_clear_offset(auprobe, insn);
@@ -753,6 +754,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 		return -ENOTSUPP;
 	}
 
+setup:
 	auprobe->branch.opc1 = opc1;
 	auprobe->branch.ilen = insn->length;
 	auprobe->branch.offs = insn->immediate.value;
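The frame-pointer unwinder above reads frames laid out as a saved %rbp followed by the return address, which is exactly the data KMSAN would otherwise flag as stale uninitialized stack. A minimal user-space version of that walk (build with -fno-omit-frame-pointer; depth is capped because libc frames may not keep the chain intact; illustrative only):

#include <stdio.h>

/* Same shape as the kernel's struct stack_frame: the saved %rbp, then
 * the return address that sits right above it. */
struct stack_frame {
	struct stack_frame	*next_frame;
	unsigned long		return_address;
};

static void __attribute__((noinline)) dump_frames(void)
{
	struct stack_frame *frame = __builtin_frame_address(0);

	for (int depth = 0; frame && depth < 3; depth++) {
		printf("ret=%#lx\n", frame->return_address);
		frame = frame->next_frame;
	}
}

static void __attribute__((noinline)) level2(void) { dump_frames(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}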
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 15f29053cec4..2e0ee14229bf 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -132,18 +132,19 @@ SECTIONS
 		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
-		ALIGN_ENTRY_TEXT_BEGIN
-		ENTRY_TEXT
-		ALIGN_ENTRY_TEXT_END
 		SOFTIRQENTRY_TEXT
-		STATIC_CALL_TEXT
-		*(.gnu.warning)
-
 #ifdef CONFIG_RETPOLINE
 		__indirect_thunk_start = .;
 		*(.text.__x86.*)
 		__indirect_thunk_end = .;
 #endif
+		STATIC_CALL_TEXT
+
+		ALIGN_ENTRY_TEXT_BEGIN
+		ENTRY_TEXT
+		ALIGN_ENTRY_TEXT_END
+		*(.gnu.warning)
+
 	} :text =0xcccc
 
 	/* End of text section, which should occupy whole number of pages */
@@ -290,6 +291,13 @@ SECTIONS
 		*(.return_sites)
 		__return_sites_end = .;
 	}
+
+	. = ALIGN(8);
+	.call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) {
+		__call_sites = .;
+		*(.call_sites)
+		__call_sites_end = .;
+	}
 #endif
 
 #ifdef CONFIG_X86_KERNEL_IBT
@@ -301,6 +309,15 @@ SECTIONS
 	}
 #endif
 
+#ifdef CONFIG_FINEIBT
+	. = ALIGN(8);
+	.cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) {
+		__cfi_sites = .;
+		*(.cfi_sites)
+		__cfi_sites_end = .;
+	}
+#endif
+
 	/*
 	 * struct alt_instr entries. From the header (alternative.h):
 	 * "Alternative instructions for different CPU types or capabilities"
@@ -493,11 +510,3 @@ INIT_PER_CPU(irq_stack_backing_store);
 #endif
 
 #endif /* CONFIG_X86_64 */
-
-#ifdef CONFIG_KEXEC_CORE
-#include <asm/kexec.h>
-
-. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-	   "kexec control code size is too big");
-#endif
-
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index e84ee5cdbd8c..ef80d361b463 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -25,6 +25,7 @@
 #include <asm/iommu.h>
 #include <asm/mach_traps.h>
 #include <asm/irqdomain.h>
+#include <asm/realmode.h>
 
 void x86_init_noop(void) { }
 void __init x86_init_uint_noop(unsigned int unused) { }
@@ -138,13 +139,15 @@ struct x86_platform_ops x86_platform __ro_after_init = {
 	.calibrate_cpu			= native_calibrate_cpu_early,
 	.calibrate_tsc			= native_calibrate_tsc,
 	.get_wallclock			= mach_get_cmos_time,
-	.set_wallclock			= mach_set_rtc_mmss,
+	.set_wallclock			= mach_set_cmos_time,
 	.iommu_shutdown			= iommu_shutdown_noop,
 	.is_untracked_pat_range		= is_ISA_range,
 	.nmi_init			= default_nmi_init,
 	.get_nmi_reason			= default_get_nmi_reason,
 	.save_sched_clock_state		= tsc_save_sched_clock_state,
 	.restore_sched_clock_state	= tsc_restore_sched_clock_state,
+	.realmode_reserve		= reserve_real_mode,
+	.realmode_init			= init_real_mode,
 	.hyper.pin_vcpu			= x86_op_int_noop,
 
 	.guest = {
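The new .call_sites and .cfi_sites ranges above follow the kernel's usual pattern: the linker script brackets a section with start/end symbols and C code then walks the span as an array (here, arrays of s32 site offsets consumed by alternative.c). A user-space sketch of the same mechanism, relying on the __start_/__stop_ symbols the GNU linker provides for identifier-named sections (the section and entry names are made up for the example):

#include <stdio.h>

/* Entries register themselves simply by being placed in the section. */
static const int site_a __attribute__((section("call_sites"), used)) = 1;
static const int site_b __attribute__((section("call_sites"), used)) = 2;

/* GNU ld emits these bounds automatically for the "call_sites" section. */
extern const int __start_call_sites[], __stop_call_sites[];

int main(void)
{
	for (const int *p = __start_call_sites; p < __stop_call_sites; p++)
		printf("site entry %d\n", *p);
	return 0;
}

At link time site_a and site_b land adjacently in the call_sites output section, so the loop visits every registered entry without any explicit list, which is the same trick the __call_sites/__call_sites_end and __cfi_sites/__cfi_sites_end pairs enable in the kernel.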