aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c41
-rw-r--r--arch/x86/kernel/alternative.c259
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/amd_nb.c12
-rw-r--r--arch/x86/kernel/aperture_64.c3
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c31
-rw-r--r--arch/x86/kernel/apic/apic_common.c4
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c12
-rw-r--r--arch/x86/kernel/apic/apic_noop.c9
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c18
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c7
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/ipi.c13
-rw-r--r--arch/x86/kernel/apic/local.h7
-rw-r--r--arch/x86/kernel/apic/msi.c8
-rw-r--r--arch/x86/kernel/apic/probe_32.c13
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c20
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets.c33
-rw-r--r--arch/x86/kernel/callthunks.c23
-rw-r--r--arch/x86/kernel/cfi.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c356
-rw-r--r--arch/x86/kernel/cpu/bugs.c104
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c49
-rw-r--r--arch/x86/kernel/cpu/common.c195
-rw-r--r--arch/x86/kernel/cpu/cpu.h3
-rw-r--r--arch/x86/kernel/cpu/debugfs.c58
-rw-r--r--arch/x86/kernel/cpu/hygon.c51
-rw-r--r--arch/x86/kernel/cpu/intel.c14
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c2
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c148
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c4
-rw-r--r--arch/x86/kernel/cpu/mce/core.c123
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c1
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c324
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h70
-rw-r--r--arch/x86/kernel/cpu/mce/threshold.c115
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c188
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c683
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c679
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h49
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c5
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c14
-rw-r--r--arch/x86/kernel/cpu/proc.c8
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c11
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c14
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h31
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c10
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c281
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c30
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c3
-rw-r--r--arch/x86/kernel/cpu/topology.c13
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c14
-rw-r--r--arch/x86/kernel/crash.c47
-rw-r--r--arch/x86/kernel/devicetree.c6
-rw-r--r--arch/x86/kernel/early-quirks.c4
-rw-r--r--arch/x86/kernel/fpu/bugs.c1
-rw-r--r--arch/x86/kernel/fpu/core.c7
-rw-r--r--arch/x86/kernel/fpu/signal.c13
-rw-r--r--arch/x86/kernel/fpu/xstate.c13
-rw-r--r--arch/x86/kernel/fpu/xstate.h3
-rw-r--r--arch/x86/kernel/ftrace_32.S2
-rw-r--r--arch/x86/kernel/ftrace_64.S2
-rw-r--r--arch/x86/kernel/head32.c120
-rw-r--r--arch/x86/kernel/head64.c23
-rw-r--r--arch/x86/kernel/head_32.S12
-rw-r--r--arch/x86/kernel/head_64.S60
-rw-r--r--arch/x86/kernel/hpet.c10
-rw-r--r--arch/x86/kernel/i8259.c38
-rw-r--r--arch/x86/kernel/idt.c9
-rw-r--r--arch/x86/kernel/irqflags.S2
-rw-r--r--arch/x86/kernel/itmt.c1
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c23
-rw-r--r--arch/x86/kernel/kgdb.c1
-rw-r--r--arch/x86/kernel/kprobes/core.c3
-rw-r--r--arch/x86/kernel/kvm.c15
-rw-r--r--arch/x86/kernel/kvmclock.c14
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c7
-rw-r--r--arch/x86/kernel/module.c20
-rw-r--r--arch/x86/kernel/nmi.c22
-rw-r--r--arch/x86/kernel/paravirt.c121
-rw-r--r--arch/x86/kernel/process.c9
-rw-r--r--arch/x86/kernel/reboot.c66
-rw-r--r--arch/x86/kernel/rtc.c2
-rw-r--r--arch/x86/kernel/setup.c167
-rw-r--r--arch/x86/kernel/setup_percpu.c4
-rw-r--r--arch/x86/kernel/sev-shared.c124
-rw-r--r--arch/x86/kernel/sev.c46
-rw-r--r--arch/x86/kernel/shstk.c33
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/signal_64.c6
-rw-r--r--arch/x86/kernel/smp.c39
-rw-r--r--arch/x86/kernel/smpboot.c151
-rw-r--r--arch/x86/kernel/topology.c33
-rw-r--r--arch/x86/kernel/traps.c5
-rw-r--r--arch/x86/kernel/tsc_sync.c10
-rw-r--r--arch/x86/kernel/unwind_orc.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S22
-rw-r--r--arch/x86/kernel/vsmp_64.c2
107 files changed, 2950 insertions, 2571 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3269a0e23d3a..0000325ab98f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -16,6 +16,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
+CFLAGS_REMOVE_head32.o = -pg
CFLAGS_REMOVE_sev.o = -pg
CFLAGS_REMOVE_rethook.o = -pg
endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2a0ea38955df..85a3ce2a3666 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,6 +63,7 @@ int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
@@ -148,6 +149,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
pr_debug("Local APIC address 0x%08x\n", madt->address);
}
+ if (madt->flags & ACPI_MADT_PCAT_COMPAT)
+ legacy_pic_pcat_compat();
+
/* ACPI 6.3 and newer support the online capable bit. */
if (acpi_gbl_FADT.header.revision > 6 ||
(acpi_gbl_FADT.header.revision == 6 &&
@@ -230,6 +234,14 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
return 0;
/*
+ * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure
+ * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID
+ * in x2APIC must be equal or greater than 0xff.
+ */
+ if (has_lapic_cpus && apic_id < 0xff)
+ return 0;
+
+ /*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
* cpus_possible_map more accurately, to permit
@@ -281,6 +293,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
processor->processor_id, /* ACPI ID */
processor->lapic_flags & ACPI_MADT_ENABLED);
+ has_lapic_cpus = true;
return 0;
}
@@ -359,7 +372,7 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
}
#ifdef CONFIG_X86_64
-static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
/*
* Remap mailbox memory only for the first call to acpi_wakeup_cpu().
@@ -856,7 +869,7 @@ int acpi_unmap_cpu(int cpu)
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
#endif
- per_cpu(x86_cpu_to_apicid, cpu) = -1;
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
set_cpu_present(cpu, false);
num_processors--;
@@ -1111,10 +1124,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
static int __init acpi_parse_madt_lapic_entries(void)
{
- int count;
- int x2count = 0;
- int ret;
- struct acpi_subtable_proc madt_proc[2];
+ int count, x2count = 0;
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
@@ -1123,21 +1133,10 @@ static int __init acpi_parse_madt_lapic_entries(void)
acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- memset(madt_proc, 0, sizeof(madt_proc));
- madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
- madt_proc[0].handler = acpi_parse_lapic;
- madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
- madt_proc[1].handler = acpi_parse_x2apic;
- ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
- sizeof(struct acpi_table_madt),
- madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
- if (ret < 0) {
- pr_err("Error parsing LAPIC/X2APIC entries\n");
- return ret;
- }
-
- count = madt_proc[0].count;
- x2count = madt_proc[1].count;
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
+ acpi_parse_lapic, MAX_LOCAL_APIC);
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
+ acpi_parse_x2apic, MAX_LOCAL_APIC);
}
if (!count && !x2count) {
pr_err("No LAPIC entries present\n");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1781e020f393..e7aeae02aaca 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -30,6 +30,7 @@
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
int __read_mostly alternatives_patched;
@@ -160,7 +161,6 @@ extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -255,6 +255,16 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
}
}
+static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ optimize_nops(instr, len);
+ sync_core();
+ local_irq_restore(flags);
+}
+
/*
* In this context, "source" is where the instructions are placed in the
* section .altinstr_replacement, for example during kernel build by the
@@ -384,6 +394,63 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
}
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
+
+noinstr void BUG_func(void)
+{
+ BUG();
+}
+EXPORT_SYMBOL(BUG_func);
+
+#define CALL_RIP_REL_OPCODE 0xff
+#define CALL_RIP_REL_MODRM 0x15
+
+/*
+ * Rewrite the "call BUG_func" replacement to point to the target of the
+ * indirect pv_ops call "call *disp(%ip)".
+ */
+static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+{
+ void *target, *bug = &BUG_func;
+ s32 disp;
+
+ if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
+ BUG();
+ }
+
+ if (a->instrlen != 6 ||
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
+ BUG();
+ }
+
+ /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
+ disp = *(s32 *)(instr + 2);
+#ifdef CONFIG_X86_64
+ /* ff 15 00 00 00 00 call *0x0(%rip) */
+ /* target address is stored at "next instruction + disp". */
+ target = *(void **)(instr + a->instrlen + disp);
+#else
+ /* ff 15 00 00 00 00 call *0x0 */
+ /* target address is stored at disp. */
+ target = *(void **)disp;
+#endif
+ if (!target)
+ target = bug;
+
+ /* (BUG_func - .) + (target - BUG_func) := target - . */
+ *(s32 *)(insn_buff + 1) += target - bug;
+
+ if (target == &nop_func)
+ return 0;
+
+ return 5;
+}
+
/*
* Replace instructions with better alternatives for this CPU type. This runs
* before SMP is initialized to avoid SMP problems with self modifying code.
@@ -402,6 +469,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
u8 insn_buff[MAX_PATCH_LEN];
DPRINTK(ALT, "alt table %px, -> %px", start, end);
+
+ /*
+ * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
+ * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
+ * During the process, KASAN becomes confused seeing partial LA57
+ * conversion and triggers a false-positive out-of-bound report.
+ *
+ * Disable KASAN until the patching is complete.
+ */
+ kasan_disable_current();
+
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
@@ -426,20 +504,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- optimize_nops(instr, a->instrlen);
+ optimize_nops_inplace(instr, a->instrlen);
continue;
}
- DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
- (a->flags & ALT_FLAG_NOT) ? "!" : "",
+ DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
a->cpuid >> 5,
a->cpuid & 0x1f,
instr, instr, a->instrlen,
- replacement, a->replacementlen);
+ replacement, a->replacementlen, a->flags);
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
+ if (a->flags & ALT_FLAG_DIRECT_CALL) {
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
+ if (insn_buff_sz < 0)
+ continue;
+ }
+
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
@@ -451,6 +534,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
text_poke_early(instr, insn_buff, insn_buff_sz);
}
+
+ kasan_enable_current();
}
static inline bool is_jcc32(struct insn *insn)
@@ -719,13 +804,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
- /*
- * Do not patch out the default return thunks if those needed are the
- * ones generated by the compiler.
- */
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
- (x86_return_thunk == __x86_return_thunk))
- return;
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ static_call_force_reinit();
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
@@ -823,15 +903,82 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
#endif /* CONFIG_X86_KERNEL_IBT */
#ifdef CONFIG_FINEIBT
+#define __CFI_DEFAULT CFI_DEFAULT
+#elif defined(CONFIG_CFI_CLANG)
+#define __CFI_DEFAULT CFI_KCFI
+#else
+#define __CFI_DEFAULT CFI_OFF
+#endif
-enum cfi_mode {
- CFI_DEFAULT,
- CFI_OFF,
- CFI_KCFI,
- CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+ const struct bpf_insn *insn);
+
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
+
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_hash,@object \n"
+" .globl cfi_bpf_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_hash: \n"
+" .long __kcfi_typeid___bpf_prog_runX \n"
+" .size cfi_bpf_hash, 4 \n"
+" .popsection \n"
+);
+
+/* Must match bpf_callback_t */
+extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
+
+__ADDRESSABLE(__bpf_callback_fn);
+
+/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_subprog_hash,@object \n"
+" .globl cfi_bpf_subprog_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_subprog_hash: \n"
+" .long __kcfi_typeid___bpf_callback_fn \n"
+" .size cfi_bpf_subprog_hash, 4 \n"
+" .popsection \n"
+);
+
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+#endif
+
+#ifdef CONFIG_FINEIBT
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
@@ -1140,8 +1287,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
goto err;
if (cfi_rand) {
- if (builtin)
+ if (builtin) {
cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
ret = cfi_rand_preamble(start_cfi, end_cfi);
if (ret)
@@ -1402,46 +1552,6 @@ int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PARAVIRT
-
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
-{
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, x86_nops[noplen], noplen);
- insns += noplen;
- len -= noplen;
- }
-}
-
-void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{
- struct paravirt_patch_site *p;
- char insn_buff[MAX_PATCH_LEN];
-
- for (p = start; p < end; p++) {
- unsigned int used;
-
- BUG_ON(p->len > MAX_PATCH_LEN);
- /* prep the buffer with the original instructions */
- memcpy(insn_buff, p->instr, p->len);
- used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
-
- BUG_ON(used > p->len);
-
- /* Pad the rest with nops */
- add_nops(insn_buff + used, p->len - used);
- text_poke_early(p->instr, insn_buff, p->len);
- }
-}
-extern struct paravirt_patch_site __start_parainstructions[],
- __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
-
/*
* Self-test for the INT3 based CALL emulation code.
*
@@ -1577,28 +1687,11 @@ void __init alternative_instructions(void)
*/
/*
- * Paravirt patching and alternative patching can be combined to
- * replace a function call with a short direct code sequence (e.g.
- * by setting a constant return value instead of doing that in an
- * external function).
- * In order to make this work the following sequence is required:
- * 1. set (artificial) features depending on used paravirt
- * functions which can later influence alternative patching
- * 2. apply paravirt patching (generally replacing an indirect
- * function call with a direct one)
- * 3. apply alternative patching (e.g. replacing a direct function
- * call with a custom code sequence)
- * Doing paravirt patching after alternative patching would clobber
- * the optimization of the custom code with a function call again.
+ * Make sure to set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching.
*/
paravirt_set_cap();
- /*
- * First patch paravirt functions, such that we overwrite the indirect
- * call with the direct call.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
__cfi_sites, __cfi_sites_end, true);
@@ -1609,10 +1702,6 @@ void __init alternative_instructions(void)
apply_retpolines(__retpoline_sites, __retpoline_sites_end);
apply_returns(__return_sites, __return_sites_end);
- /*
- * Then patch alternatives, such that those paravirt calls that are in
- * alternatives can be overwritten by their immediate fragments.
- */
apply_alternatives(__alt_instructions, __alt_instructions_end);
/*
@@ -1676,8 +1765,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
} else {
local_irq_save(flags);
memcpy(addr, opcode, len);
- local_irq_restore(flags);
sync_core();
+ local_irq_restore(flags);
/*
* Could also do a CLFLUSH here to speed up CPU recovery; but
@@ -1887,7 +1976,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
* Note that the caller must ensure that if the modified code is part of a
* module, the module would not be removed during poking. This can be achieved
* by registering a module notifier, and ordering module removal and patching
- * trough a mutex.
+ * through a mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 56a917df410d..2ae98f754e59 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -776,7 +776,7 @@ int __init gart_iommu_init(void)
iommu_size >> PAGE_SHIFT);
/*
* Tricky. The GART table remaps the physical memory range,
- * so the CPU wont notice potential aliases and if the memory
+ * so the CPU won't notice potential aliases and if the memory
* is remapped to UC later on, we might surprise the PCI devices
* with a stray writeout of a cacheline. So play it sure and
* do an explicit, full-scale wbinvd() _after_ having marked all
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 356de955e78d..053f6dcc6b2c 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -27,6 +27,7 @@
#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a
#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507
#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
+#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
@@ -43,6 +44,7 @@
#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4
#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
+#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
@@ -62,6 +64,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) },
{}
};
@@ -93,6 +96,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
{}
};
@@ -112,9 +116,13 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
{}
};
@@ -386,7 +394,7 @@ int amd_get_subcaches(int cpu)
pci_read_config_dword(link, 0x1d4, &mask);
- return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf;
+ return (mask >> (4 * cpu_data(cpu).topo.core_id)) & 0xf;
}
int amd_set_subcaches(int cpu, unsigned long mask)
@@ -412,7 +420,7 @@ int amd_set_subcaches(int cpu, unsigned long mask)
pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
}
- cuid = cpu_data(cpu).cpu_core_id;
+ cuid = cpu_data(cpu).topo.core_id;
mask <<= 4 * cuid;
mask |= (0xf ^ (1 << cuid)) << 26;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 4feaa670d578..89c0c8a3fc7e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -259,10 +259,9 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
order);
}
- /* No multi-function device? */
type = read_pci_config_byte(bus, slot, func,
PCI_HEADER_TYPE);
- if (!(type & 0x80))
+ if (!(type & PCI_HEADER_TYPE_MFD))
break;
}
}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 2ee867d796d9..3bf0487cf3b7 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -4,7 +4,7 @@
#
# Leads to non-deterministic coverage that is not a function of syscall inputs.
-# In particualr, smp_apic_timer_interrupt() is called in random places.
+# In particular, smp_apic_timer_interrupt() is called in random places.
KCOV_INSTRUMENT := n
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 760adac3d1a8..4667bc4b00ab 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -36,6 +36,8 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <xen/xen.h>
+
#include <asm/trace/irq_vectors.h>
#include <asm/irq_remapping.h>
#include <asm/pc-conf-reg.h>
@@ -70,7 +72,7 @@ unsigned int num_processors;
unsigned disabled_cpus;
/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid __ro_after_init = -1U;
+u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
u8 boot_cpu_apic_version __ro_after_init;
@@ -85,7 +87,7 @@ physid_mask_t phys_cpu_present_map;
* disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
* avoid undefined behaviour caused by sending INIT from AP to BSP.
*/
-static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID;
+static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID;
/*
* This variable controls which CPUs receive external NMIs. By default,
@@ -109,7 +111,7 @@ static inline bool apic_accessible(void)
/*
* Map cpu index to physical APIC ID
*/
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
@@ -780,7 +782,7 @@ bool __init apic_needs_pit(void)
/*
* If interrupt delivery mode is legacy PIC or virtual wire without
- * configuration, the local APIC timer wont be set up. Make sure
+ * configuration, the local APIC timer won't be set up. Make sure
* that the PIT is initialized.
*/
if (apic_intr_mode == APIC_PIC ||
@@ -1763,7 +1765,7 @@ static void __x2apic_enable(void)
static int __init setup_nox2apic(char *str)
{
if (x2apic_enabled()) {
- int apicid = native_apic_msr_read(APIC_ID);
+ u32 apicid = native_apic_msr_read(APIC_ID);
if (apicid >= 255) {
pr_warn("Apicid: %08x, cannot enforce nox2apic\n",
@@ -2316,13 +2318,11 @@ static int nr_logical_cpuids = 1;
/*
* Used to store mapping between logical CPU IDs and APIC IDs.
*/
-int cpuid_to_apicid[] = {
- [0 ... NR_CPUS - 1] = -1,
-};
+u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, };
bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
- return phys_id == cpuid_to_apicid[cpu];
+ return phys_id == (u64)cpuid_to_apicid[cpu];
}
#ifdef CONFIG_SMP
@@ -2344,6 +2344,15 @@ static int __init smp_init_primary_thread_mask(void)
{
unsigned int cpu;
+ /*
+ * XEN/PV provides either none or useless topology information.
+ * Pretend that all vCPUs are primary threads.
+ */
+ if (xen_pv_domain()) {
+ cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask);
+ return 0;
+ }
+
for (cpu = 0; cpu < nr_logical_cpuids; cpu++)
cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]);
return 0;
@@ -2382,7 +2391,7 @@ static int allocate_logical_cpuid(int apicid)
return nr_logical_cpuids++;
}
-static void cpu_update_apic(int cpu, int apicid)
+static void cpu_update_apic(int cpu, u32 apicid)
{
#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
@@ -2535,7 +2544,7 @@ static struct {
*/
int active;
/* r/w apic fields */
- unsigned int apic_id;
+ u32 apic_id;
unsigned int apic_taskpri;
unsigned int apic_ldr;
unsigned int apic_dfr;
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 7bc5d9bf59cd..8a00141073ea 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -18,7 +18,7 @@ u32 apic_flat_calc_apicid(unsigned int cpu)
return 1U << cpu;
}
-bool default_check_apicid_used(physid_mask_t *map, int apicid)
+bool default_check_apicid_used(physid_mask_t *map, u32 apicid)
{
return physid_isset(apicid, *map);
}
@@ -28,7 +28,7 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
*retmap = *phys_map;
}
-int default_cpu_present_to_apicid(int mps_cpu)
+u32 default_cpu_present_to_apicid(int mps_cpu)
{
if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
return (int)per_cpu(x86_cpu_to_apicid, mps_cpu);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 032a84e2c3cc..b295a056a4fc 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -56,17 +56,17 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
_flat_send_IPI_mask(mask, vector);
}
-static unsigned int flat_get_apic_id(unsigned long x)
+static u32 flat_get_apic_id(u32 x)
{
return (x >> 24) & 0xFF;
}
-static u32 set_apic_id(unsigned int id)
+static u32 set_apic_id(u32 id)
{
return (id & 0xFF) << 24;
}
-static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
}
@@ -82,7 +82,6 @@ static struct apic apic_flat __ro_after_init = {
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
@@ -103,6 +102,7 @@ static struct apic apic_flat __ro_after_init = {
.send_IPI_allbutself = default_send_IPI_allbutself,
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
@@ -153,13 +153,10 @@ static struct apic apic_physflat __ro_after_init = {
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
- .check_apicid_used = NULL,
- .ioapic_phys_id_map = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.phys_pkg_id = flat_phys_pkg_id,
@@ -175,6 +172,7 @@ static struct apic apic_physflat __ro_after_init = {
.send_IPI_allbutself = default_send_IPI_allbutself,
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 966d7cf10b95..9f1d553eb48f 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -18,6 +18,8 @@
#include <asm/apic.h>
+#include "local.h"
+
static void noop_send_IPI(int cpu, int vector) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
@@ -25,10 +27,10 @@ static void noop_send_IPI_allbutself(int vector) { }
static void noop_send_IPI_all(int vector) { }
static void noop_send_IPI_self(int vector) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
-static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; }
+static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; }
static u64 noop_apic_icr_read(void) { return 0; }
-static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; }
-static unsigned int noop_get_apic_id(unsigned long x) { return 0; }
+static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; }
+static u32 noop_get_apic_id(u32 apicid) { return 0; }
static void noop_apic_eoi(void) { }
static u32 noop_apic_read(u32 reg)
@@ -45,7 +47,6 @@ static void noop_apic_write(u32 reg, u32 val)
struct apic apic_noop __ro_after_init = {
.name = "noop",
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 63f3d7be9dc7..7d0c51b9d3bc 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -25,7 +25,7 @@ static const struct apic apic_numachip1;
static const struct apic apic_numachip2;
static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
-static unsigned int numachip1_get_apic_id(unsigned long x)
+static u32 numachip1_get_apic_id(u32 x)
{
unsigned long value;
unsigned int id = (x >> 24) & 0xff;
@@ -38,12 +38,12 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
return id;
}
-static u32 numachip1_set_apic_id(unsigned int id)
+static u32 numachip1_set_apic_id(u32 id)
{
return (id & 0xff) << 24;
}
-static unsigned int numachip2_get_apic_id(unsigned long x)
+static u32 numachip2_get_apic_id(u32 x)
{
u64 mcfg;
@@ -51,12 +51,12 @@ static unsigned int numachip2_get_apic_id(unsigned long x)
return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
}
-static u32 numachip2_set_apic_id(unsigned int id)
+static u32 numachip2_set_apic_id(u32 id)
{
return id << 24;
}
-static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
}
@@ -71,7 +71,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val)
numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
}
-static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
{
numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
@@ -161,7 +161,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
u64 val;
u32 nodes = 1;
- this_cpu_write(cpu_llc_id, node);
+ c->topo.llc_id = node;
/* Account for nodes per socket in multi-core-module processors */
if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
@@ -169,7 +169,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
nodes = ((val >> 3) & 7) + 1;
}
- c->phys_proc_id = node / nodes;
+ c->topo.pkg_id = node / nodes;
}
static int __init numachip_system_init(void)
@@ -222,7 +222,6 @@ static const struct apic apic_numachip1 __refconst = {
.probe = numachip1_probe,
.acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
@@ -259,7 +258,6 @@ static const struct apic apic_numachip2 __refconst = {
.probe = numachip2_probe,
.acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0e5535add4b5..5a0d60b38e6b 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -13,12 +13,12 @@
#include "local.h"
-static unsigned bigsmp_get_apic_id(unsigned long x)
+static u32 bigsmp_get_apic_id(u32 x)
{
return (x >> 24) & 0xFF;
}
-static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
+static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid)
{
return false;
}
@@ -29,7 +29,7 @@ static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *re
physids_promote(0xFFL, retmap);
}
-static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb)
{
return cpuid_apic >> index_msb;
}
@@ -80,7 +80,6 @@ static struct apic apic_bigsmp __ro_after_init = {
.name = "bigsmp",
.probe = probe_bigsmp,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 1,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00da6cf6b07d..40c7cf180c20 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -997,7 +997,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
/*
* Legacy ISA IRQ has already been allocated, just add pin to
* the pin list associated with this IRQ and program the IOAPIC
- * entry. The IOAPIC entry
+ * entry.
*/
if (irq_data && irq_data->parent_data) {
if (!mp_check_pin_attr(irq, info))
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index a44ba7209ef3..5da693d633b7 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -97,6 +97,14 @@ sendmask:
__apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
}
+void apic_send_nmi_to_offline_cpu(unsigned int cpu)
+{
+ if (WARN_ON_ONCE(!apic->nmi_to_offline_cpu))
+ return;
+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, &cpus_booted_once_mask)))
+ return;
+ apic->send_IPI(cpu, NMI_VECTOR);
+}
#endif /* CONFIG_SMP */
static inline int __prepare_ICR2(unsigned int mask)
@@ -281,7 +289,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
}
#ifdef CONFIG_SMP
-static int convert_apicid_to_cpu(int apic_id)
+static int convert_apicid_to_cpu(u32 apic_id)
{
int i;
@@ -294,7 +302,8 @@ static int convert_apicid_to_cpu(int apic_id)
int safe_smp_processor_id(void)
{
- int apicid, cpuid;
+ u32 apicid;
+ int cpuid;
if (!boot_cpu_has(X86_FEATURE_APIC))
return 0;
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index ec219c659c7d..9ea6186ea88c 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -15,9 +15,9 @@
/* X2APIC */
void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
-unsigned int x2apic_get_apic_id(unsigned long id);
-u32 x2apic_set_apic_id(unsigned int id);
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
+u32 x2apic_get_apic_id(u32 id);
+u32 x2apic_set_apic_id(u32 id);
+u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb);
void x2apic_send_IPI_all(int vector);
void x2apic_send_IPI_allbutself(int vector);
@@ -64,6 +64,7 @@ void default_send_IPI_all(int vector);
void default_send_IPI_self(int vector);
bool default_apic_id_registered(void);
+bool default_check_apicid_used(physid_mask_t *map, u32 apicid);
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 6b6b711678fe..d9651f15ae4f 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -55,14 +55,14 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
* caused by the non-atomic update of the address/data pair.
*
* Direct update is possible when:
- * - The MSI is maskable (remapped MSI does not use this code path)).
- * The quirk bit is not set in this case.
+ * - The MSI is maskable (remapped MSI does not use this code path).
+ * The reservation mode bit is set in this case.
* - The new vector is the same as the old vector
* - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
* - The interrupt is not yet started up
* - The new destination CPU is the same as the old destination CPU
*/
- if (!irqd_msi_nomask_quirk(irqd) ||
+ if (!irqd_can_reserve(irqd) ||
cfg->vector == old_cfg.vector ||
old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
!irqd_is_started(irqd) ||
@@ -215,8 +215,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
if (WARN_ON_ONCE(domain != real_parent))
return false;
info->chip->irq_set_affinity = msi_set_affinity;
- /* See msi_set_affinity() for the gory details */
- info->flags |= MSI_FLAG_NOMASK_QUIRK;
break;
case DOMAIN_BUS_DMAR:
case DOMAIN_BUS_AMDVI:
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 9a06df6cdd68..c0f78059f06a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -18,11 +18,21 @@
#include "local.h"
-static int default_phys_pkg_id(int cpuid_apic, int index_msb)
+static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb)
{
return cpuid_apic >> index_msb;
}
+static u32 default_get_apic_id(u32 x)
+{
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+
+ if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
+ return (x >> 24) & 0xFF;
+ else
+ return (x >> 24) & 0x0F;
+}
+
/* should be called last. */
static int probe_default(void)
{
@@ -35,7 +45,6 @@ static struct apic apic_default __ro_after_init = {
.probe = probe_default,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 319448d87b99..185738c72766 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -738,8 +738,8 @@ int __init arch_probe_nr_irqs(void)
void lapic_assign_legacy_vector(unsigned int irq, bool replace)
{
/*
- * Use assign system here so it wont get accounted as allocated
- * and moveable in the cpu hotplug check and it prevents managed
+ * Use assign system here so it won't get accounted as allocated
+ * and movable in the cpu hotplug check and it prevents managed
* irq reservation from touching it.
*/
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index affbff65e497..28a7d3f2312d 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -227,7 +227,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
@@ -251,6 +250,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 788cdb4ee394..409815a40668 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -124,17 +124,17 @@ static int x2apic_phys_probe(void)
return apic == &apic_x2apic_phys;
}
-unsigned int x2apic_get_apic_id(unsigned long id)
+u32 x2apic_get_apic_id(u32 id)
{
return id;
}
-u32 x2apic_set_apic_id(unsigned int id)
+u32 x2apic_set_apic_id(u32 id)
{
return id;
}
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb)
{
return initial_apicid >> index_msb;
}
@@ -145,7 +145,6 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
@@ -166,6 +165,7 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d9f5d7492f83..f1766b18dcd0 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -110,7 +110,7 @@ static void __init early_get_pnodeid(void)
} else if (UVH_RH_GAM_ADDR_MAP_CONFIG) {
union uvh_rh_gam_addr_map_config_u m_n_config;
- m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
uv_cpuid.n_skt = m_n_config.s.n_skt;
if (is_uv(UV3))
uv_cpuid.m_skt = m_n_config.s3.m_skt;
@@ -701,7 +701,7 @@ static __init void build_uv_gr_table(void)
}
}
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
{
unsigned long val;
int pnode;
@@ -779,7 +779,7 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
-static u32 set_apic_id(unsigned int id)
+static u32 set_apic_id(u32 id)
{
return id;
}
@@ -789,7 +789,7 @@ static unsigned int uv_read_apic_id(void)
return x2apic_get_apic_id(apic_read(APIC_ID));
}
-static int uv_phys_pkg_id(int initial_apicid, int index_msb)
+static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb)
{
return uv_read_apic_id() >> index_msb;
}
@@ -805,7 +805,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
@@ -1533,7 +1532,7 @@ static void __init build_socket_tables(void)
{
struct uv_gam_range_entry *gre = uv_gre_table;
int nums, numn, nump;
- int cpu, i, lnid;
+ int i, lnid, apicid;
int minsock = _min_socket;
int maxsock = _max_socket;
int minpnode = _min_pnode;
@@ -1584,15 +1583,14 @@ static void __init build_socket_tables(void)
/* Set socket -> node values: */
lnid = NUMA_NO_NODE;
- for_each_possible_cpu(cpu) {
- int nid = cpu_to_node(cpu);
- int apicid, sockid;
+ for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) {
+ int nid = __apicid_to_node[apicid];
+ int sockid;
- if (lnid == nid)
+ if ((nid == NUMA_NO_NODE) || (lnid == nid))
continue;
lnid = nid;
- apicid = per_cpu(x86_cpu_to_apicid, cpu);
sockid = apicid >> uv_cpuid.socketid_shift;
if (_socket_to_node[sockid - minsock] == SOCK_EMPTY)
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5934ee5bc087..76a5ced278c2 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -420,7 +420,7 @@ static DEFINE_MUTEX(apm_mutex);
* This is for buggy BIOS's that refer to (real mode) segment 0x40
* even though they are called in protected mode.
*/
-static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS,
(unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
static const char driver_version[] = "1.16ac"; /* no spaces */
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index dc3576303f1a..6913b372ccf7 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -68,26 +68,19 @@ static void __used common(void)
#endif
BLANK();
- OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx);
- OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx);
- OFFSET(TDX_MODULE_r8, tdx_module_output, r8);
- OFFSET(TDX_MODULE_r9, tdx_module_output, r9);
- OFFSET(TDX_MODULE_r10, tdx_module_output, r10);
- OFFSET(TDX_MODULE_r11, tdx_module_output, r11);
-
- BLANK();
- OFFSET(TDX_HYPERCALL_r8, tdx_hypercall_args, r8);
- OFFSET(TDX_HYPERCALL_r9, tdx_hypercall_args, r9);
- OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10);
- OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11);
- OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12);
- OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13);
- OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14);
- OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15);
- OFFSET(TDX_HYPERCALL_rdi, tdx_hypercall_args, rdi);
- OFFSET(TDX_HYPERCALL_rsi, tdx_hypercall_args, rsi);
- OFFSET(TDX_HYPERCALL_rbx, tdx_hypercall_args, rbx);
- OFFSET(TDX_HYPERCALL_rdx, tdx_hypercall_args, rdx);
+ OFFSET(TDX_MODULE_rcx, tdx_module_args, rcx);
+ OFFSET(TDX_MODULE_rdx, tdx_module_args, rdx);
+ OFFSET(TDX_MODULE_r8, tdx_module_args, r8);
+ OFFSET(TDX_MODULE_r9, tdx_module_args, r9);
+ OFFSET(TDX_MODULE_r10, tdx_module_args, r10);
+ OFFSET(TDX_MODULE_r11, tdx_module_args, r11);
+ OFFSET(TDX_MODULE_r12, tdx_module_args, r12);
+ OFFSET(TDX_MODULE_r13, tdx_module_args, r13);
+ OFFSET(TDX_MODULE_r14, tdx_module_args, r14);
+ OFFSET(TDX_MODULE_r15, tdx_module_args, r15);
+ OFFSET(TDX_MODULE_rbx, tdx_module_args, rbx);
+ OFFSET(TDX_MODULE_rdi, tdx_module_args, rdi);
+ OFFSET(TDX_MODULE_rsi, tdx_module_args, rsi);
BLANK();
OFFSET(BP_scratch, boot_params, scratch);
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index 2324c7f9a841..cf7e5be1b844 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -50,11 +50,6 @@ EXPORT_SYMBOL_GPL(__x86_call_count);
extern s32 __call_sites[], __call_sites_end[];
-struct thunk_desc {
- void *template;
- unsigned int template_size;
-};
-
struct core_text {
unsigned long base;
unsigned long end;
@@ -245,14 +240,13 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
}
static __init_or_module void
-patch_paravirt_call_sites(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end,
- const struct core_text *ct)
+patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end,
+ const struct core_text *ct)
{
- struct paravirt_patch_site *p;
+ struct alt_instr *a;
- for (p = start; p < end; p++)
- patch_call(p->instr, ct);
+ for (a = start; a < end; a++)
+ patch_call((void *)&a->instr_offset + a->instr_offset, ct);
}
static __init_or_module void
@@ -260,7 +254,7 @@ callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
{
prdbg("Patching call sites %s\n", ct->name);
patch_call_sites(cs->call_start, cs->call_end, ct);
- patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct);
+ patch_alt_call_sites(cs->alt_start, cs->alt_end, ct);
prdbg("Patching call sites done%s\n", ct->name);
}
@@ -269,8 +263,8 @@ void __init callthunks_patch_builtin_calls(void)
struct callthunk_sites cs = {
.call_start = __call_sites,
.call_end = __call_sites_end,
- .pv_start = __parainstructions,
- .pv_end = __parainstructions_end
+ .alt_start = __alt_instructions,
+ .alt_end = __alt_instructions_end
};
if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
@@ -279,7 +273,6 @@ void __init callthunks_patch_builtin_calls(void)
pr_info("Setting up call depth tracking\n");
mutex_lock(&text_mutex);
callthunks_setup(&cs, &builtin_coretext);
- static_call_force_reinit();
thunks_initialized = true;
mutex_unlock(&text_mutex);
}
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
index 8674a5c0c031..e6bf78fac146 100644
--- a/arch/x86/kernel/cfi.c
+++ b/arch/x86/kernel/cfi.c
@@ -4,10 +4,10 @@
*
* Copyright (C) 2022 Google LLC
*/
-#include <asm/cfi.h>
+#include <linux/string.h>
+#include <linux/cfi.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
-#include <linux/string.h>
/*
* Returns the target address and the expected type when regs->ip points
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4350f6bfc064..93eabf544031 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -54,6 +54,8 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
obj-$(CONFIG_ACRN_GUEST) += acrn.o
+obj-$(CONFIG_DEBUG_FS) += debugfs.o
+
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd8379d84445..f3abca334199 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -34,83 +34,6 @@
*/
static u32 nodes_per_socket = 1;
-/*
- * AMD errata checking
- *
- * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
- * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
- * have an OSVW id assigned, which it takes as first argument. Both take a
- * variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE().
- *
- * Example:
- *
- * const int amd_erratum_319[] =
- * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
- * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
- * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
- */
-
-#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
-#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
-#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
- ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
-#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
-#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
-#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
-
-static const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
- AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
-
-static const int amd_erratum_383[] =
- AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
-
-/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
-static const int amd_erratum_1054[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
-
-static const int amd_zenbleed[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf),
- AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));
-
-static const int amd_div0[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf));
-
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
-{
- int osvw_id = *erratum++;
- u32 range;
- u32 ms;
-
- if (osvw_id >= 0 && osvw_id < 65536 &&
- cpu_has(cpu, X86_FEATURE_OSVW)) {
- u64 osvw_len;
-
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
- if (osvw_id < osvw_len) {
- u64 osvw_bits;
-
- rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
- osvw_bits);
- return osvw_bits & (1ULL << (osvw_id & 0x3f));
- }
- }
-
- /* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_stepping;
- while ((range = *erratum++))
- if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
- (ms >= AMD_MODEL_RANGE_START(range)) &&
- (ms <= AMD_MODEL_RANGE_END(range)))
- return true;
-
- return false;
-}
-
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -378,7 +301,7 @@ static int nearby_node(int apicid)
#endif
/*
- * Fix up cpu_core_id for pre-F17h systems to be in the
+ * Fix up topo::core_id for pre-F17h systems to be in the
* [0 .. cores_per_node - 1] range. Not really needed but
* kept so as not to break existing setups.
*/
@@ -390,7 +313,7 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
return;
cus_per_node = c->x86_max_cores / nodes_per_socket;
- c->cpu_core_id %= cus_per_node;
+ c->topo.core_id %= cus_per_node;
}
/*
@@ -401,8 +324,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
*/
static void amd_get_topology(struct cpuinfo_x86 *c)
{
- int cpu = smp_processor_id();
-
/* get information required for multi-node processors */
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
int err;
@@ -410,13 +331,13 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- c->cpu_die_id = ecx & 0xff;
+ c->topo.die_id = ecx & 0xff;
if (c->x86 == 0x15)
- c->cu_id = ebx & 0xff;
+ c->topo.cu_id = ebx & 0xff;
if (c->x86 >= 0x17) {
- c->cpu_core_id = ebx & 0xff;
+ c->topo.core_id = ebx & 0xff;
if (smp_num_siblings > 1)
c->x86_max_cores /= smp_num_siblings;
@@ -430,15 +351,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
if (!err)
c->x86_coreid_bits = get_count_order(c->x86_max_cores);
- cacheinfo_amd_init_llc_id(c, cpu);
+ cacheinfo_amd_init_llc_id(c);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- c->cpu_die_id = value & 7;
-
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+ c->topo.die_id = value & 7;
+ c->topo.llc_id = c->topo.die_id;
} else
return;
@@ -455,15 +375,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
unsigned bits;
- int cpu = smp_processor_id();
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1);
/* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
+ c->topo.pkg_id = c->topo.initial_apicid >> bits;
/* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
+ c->topo.llc_id = c->topo.die_id = c->topo.pkg_id;
}
u32 amd_get_nodes_per_socket(void)
@@ -477,11 +396,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
- unsigned apicid = c->apicid;
+ unsigned apicid = c->topo.apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = get_llc_id(cpu);
+ node = per_cpu_llc_id(cpu);
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -511,7 +430,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
- int ht_nodeid = c->initial_apicid;
+ int ht_nodeid = c->topo.initial_apicid;
if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
@@ -616,6 +535,62 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
resctrl_cpu_detect(c);
+
+ /* Figure out Zen generations: */
+ switch (c->x86) {
+ case 0x17:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x50 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN1);
+ break;
+ case 0x30 ... 0x4f:
+ case 0x60 ... 0x7f:
+ case 0x90 ... 0x91:
+ case 0xa0 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN2);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x19:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN3);
+ break;
+ case 0x10 ... 0x1f:
+ case 0x60 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN4);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x1a:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x2f:
+ case 0x40 ... 0x4f:
+ case 0x70 ... 0x7f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN5);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return;
+
+warn:
+ WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model);
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -739,15 +714,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
- /*
- * Check whether the machine is affected by erratum 400. This is
- * used to select the proper idle routine and to enable the check
- * whether the machine is affected in arch_post_acpi_init(), which
- * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
- */
- if (cpu_has_amd_erratum(c, amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_E400);
-
early_detect_mem_encrypt(c);
/* Re-enable TopologyExtensions if switched off by BIOS */
@@ -766,6 +732,15 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_TOPOEXT))
smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
+ if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ }
+ }
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -805,6 +780,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
msr_set_bit(MSR_K7_HWCR, 6);
#endif
set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x41 ||
+ (c->x86_model == 0x41 && c->x86_stepping >= 0x2))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_gh(struct cpuinfo_x86 *c)
@@ -838,8 +823,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
*/
msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x2 ||
+ (c->x86_model == 0x2 && c->x86_stepping >= 0x1))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_ln(struct cpuinfo_x86 *c)
@@ -932,6 +926,19 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static void fix_erratum_1386(struct cpuinfo_x86 *c)
+{
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ */
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
+}
+
void init_spectral_chicken(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_CPU_UNRET_ENTRY
@@ -942,34 +949,28 @@ void init_spectral_chicken(struct cpuinfo_x86 *c)
*
* This suppresses speculation from the middle of a basic block, i.e. it
* suppresses non-branch predictions.
- *
- * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H
*/
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
}
}
#endif
- /*
- * Work around Erratum 1386. The XSAVES instruction malfunctions in
- * certain circumstances on Zen1/2 uarch, and not all parts have had
- * updated microcode at the time of writing (March 2023).
- *
- * Affected parts all have no supervisor XSAVE states, meaning that
- * the XSAVEC instruction (which works fine) is equivalent.
- */
- clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
-static void init_amd_zn(struct cpuinfo_x86 *c)
+static void init_amd_zen_common(void)
{
- set_cpu_cap(c, X86_FEATURE_ZEN);
-
+ setup_force_cpu_cap(X86_FEATURE_ZEN);
#ifdef CONFIG_NUMA
node_reclaim_distance = 32;
#endif
+}
+
+static void init_amd_zen1(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+ fix_erratum_1386(c);
/* Fix up CPUID bits, but only if not virtualised. */
if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
@@ -977,15 +978,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
/* Erratum 1076: CPB feature bit not being set in CPUID. */
if (!cpu_has(c, X86_FEATURE_CPB))
set_cpu_cap(c, X86_FEATURE_CPB);
-
- /*
- * Zen3 (Fam19 model < 0x10) parts are not susceptible to
- * Branch Type Confusion, but predate the allocation of the
- * BTC_NO bit.
- */
- if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
- set_cpu_cap(c, X86_FEATURE_BTC_NO);
}
+
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
}
static bool cpu_has_zenbleed_microcode(void)
@@ -1001,7 +997,6 @@ static bool cpu_has_zenbleed_microcode(void)
default:
return false;
- break;
}
if (boot_cpu_data.microcode < good_rev)
@@ -1010,11 +1005,8 @@ static bool cpu_has_zenbleed_microcode(void)
return true;
}
-static void zenbleed_check(struct cpuinfo_x86 *c)
+static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
{
- if (!cpu_has_amd_erratum(c, amd_zenbleed))
- return;
-
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return;
@@ -1029,8 +1021,46 @@ static void zenbleed_check(struct cpuinfo_x86 *c)
}
}
+static void init_amd_zen2(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+ init_spectral_chicken(c);
+ fix_erratum_1386(c);
+ zen2_zenbleed_check(c);
+}
+
+static void init_amd_zen3(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ /*
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
+ */
+ if (!cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
+}
+
+static void init_amd_zen4(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+}
+
+static void init_amd_zen5(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
+ u64 vm_cr;
+
early_init_amd(c);
/*
@@ -1047,7 +1077,7 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_FSRS);
/* get apicid instead of initial apic id from cpuid */
- c->apicid = read_apic_id();
+ c->topo.apicid = read_apic_id();
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
@@ -1062,11 +1092,19 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
case 0x16: init_amd_jg(c); break;
- case 0x17: init_spectral_chicken(c);
- fallthrough;
- case 0x19: init_amd_zn(c); break;
}
+ if (boot_cpu_has(X86_FEATURE_ZEN1))
+ init_amd_zen1(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN2))
+ init_amd_zen2(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN3))
+ init_amd_zen3(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN4))
+ init_amd_zen4(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN5))
+ init_amd_zen5(c);
+
/*
* Enable workaround for FXSAVE leak on CPUs
* without a XSaveErPtr feature
@@ -1082,6 +1120,14 @@ static void init_amd(struct cpuinfo_x86 *c)
init_amd_cacheinfo(c);
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
/*
* Use LFENCE for execution serialization. On families which
@@ -1118,7 +1164,7 @@ static void init_amd(struct cpuinfo_x86 *c)
* Counter May Be Inaccurate".
*/
if (cpu_has(c, X86_FEATURE_IRPERF) &&
- !cpu_has_amd_erratum(c, amd_erratum_1054))
+ (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
@@ -1134,12 +1180,8 @@ static void init_amd(struct cpuinfo_x86 *c)
cpu_has(c, X86_FEATURE_AUTOIBRS))
WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
- zenbleed_check(c);
-
- if (cpu_has_amd_erratum(c, amd_div0)) {
- pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
- setup_force_cpu_bug(X86_BUG_DIV0);
- }
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
#ifdef CONFIG_X86_32
@@ -1293,31 +1335,15 @@ static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- zenbleed_check(c);
+ zen2_zenbleed_check(c);
}
void amd_check_microcode(void)
{
- on_each_cpu(zenbleed_check_cpu, NULL, 1);
-}
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
-bool cpu_has_ibpb_brtype_microcode(void)
-{
- switch (boot_cpu_data.x86) {
- /* Zen1/2 IBPB flushes branch type predictions too. */
- case 0x17:
- return boot_cpu_has(X86_FEATURE_AMD_IBPB);
- case 0x19:
- /* Poke the MSR bit on Zen3/4 to check its presence. */
- if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
- setup_force_cpu_cap(X86_FEATURE_SBPB);
- return true;
- } else {
- return false;
- }
- default:
- return false;
- }
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
/*
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f081d26616ac..bb0ab8466b91 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(x86_pred_cmd);
static DEFINE_MUTEX(spec_ctrl_mutex);
-void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
+void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
static void update_spec_ctrl(u64 val)
@@ -717,7 +717,7 @@ void update_gds_msr(void)
case GDS_MITIGATION_UCODE_NEEDED:
case GDS_MITIGATION_HYPERVISOR:
return;
- };
+ }
wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
@@ -1019,7 +1019,6 @@ static void __init retbleed_select_mitigation(void)
do_cmd_auto:
case RETBLEED_CMD_AUTO:
- default:
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY))
@@ -1042,8 +1041,7 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
- if (IS_ENABLED(CONFIG_RETHUNK))
- x86_return_thunk = retbleed_return_thunk;
+ x86_return_thunk = retbleed_return_thunk;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -1061,7 +1059,8 @@ do_cmd_auto:
case RETBLEED_MITIGATION_STUFF:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
- x86_set_skl_return_thunk();
+
+ x86_return_thunk = call_depth_return_thunk;
break;
default:
@@ -1290,6 +1289,8 @@ spectre_v2_user_select_mitigation(void)
spectre_v2_user_ibpb = mode;
switch (cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ break;
case SPECTRE_V2_USER_CMD_FORCE:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
@@ -1301,8 +1302,6 @@ spectre_v2_user_select_mitigation(void)
case SPECTRE_V2_USER_CMD_SECCOMP:
static_branch_enable(&switch_mm_cond_ibpb);
break;
- default:
- break;
}
pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
@@ -2160,6 +2159,10 @@ static int l1d_flush_prctl_get(struct task_struct *task)
static int ssb_prctl_get(struct task_struct *task)
{
switch (ssb_mode) {
+ case SPEC_STORE_BYPASS_NONE:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ return PR_SPEC_ENABLE;
+ return PR_SPEC_NOT_AFFECTED;
case SPEC_STORE_BYPASS_DISABLE:
return PR_SPEC_DISABLE;
case SPEC_STORE_BYPASS_SECCOMP:
@@ -2171,11 +2174,8 @@ static int ssb_prctl_get(struct task_struct *task)
if (task_spec_ssb_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- default:
- if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
- return PR_SPEC_ENABLE;
- return PR_SPEC_NOT_AFFECTED;
}
+ BUG();
}
static int ib_prctl_get(struct task_struct *task)
@@ -2353,6 +2353,8 @@ early_param("l1tf", l1tf_cmdline);
enum srso_mitigation {
SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_UCODE_NEEDED,
+ SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
SRSO_MITIGATION_MICROCODE,
SRSO_MITIGATION_SAFE_RET,
SRSO_MITIGATION_IBPB,
@@ -2368,11 +2370,13 @@ enum srso_mitigation_cmd {
};
static const char * const srso_strings[] = {
- [SRSO_MITIGATION_NONE] = "Vulnerable",
- [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode",
- [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET",
- [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
- [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+ [SRSO_MITIGATION_NONE] = "Vulnerable",
+ [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode",
+ [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
+ [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
+ [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
};
static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
@@ -2404,46 +2408,45 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode;
+ bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
- goto pred_cmd;
+ if (cpu_mitigations_off())
+ return;
- /*
- * The first check is for the kernel running as a guest in order
- * for guests to verify whether IBPB is a viable mitigation.
- */
- has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode();
- if (!has_microcode) {
- pr_warn("IBPB-extending microcode not applied!\n");
- pr_warn(SRSO_NOTICE);
- } else {
- /*
- * Enable the synthetic (even if in a real CPUID leaf)
- * flags for guests.
- */
- setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
+ }
+ if (has_microcode) {
/*
* Zen1/2 with SMT off aren't vulnerable after the right
* IBPB microcode has been applied.
+ *
+ * Zen1/2 don't have SBPB, no need to try to enable it here.
*/
if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
return;
}
- }
- if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
- if (has_microcode) {
- pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n");
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
srso_mitigation = SRSO_MITIGATION_IBPB;
- goto pred_cmd;
+ goto out;
}
+ } else {
+ pr_warn("IBPB-extending microcode not applied!\n");
+ pr_warn(SRSO_NOTICE);
+
+ /* may be overwritten by SRSO_CMD_SAFE_RET below */
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
}
switch (srso_cmd) {
case SRSO_CMD_OFF:
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
return;
case SRSO_CMD_MICROCODE:
@@ -2469,10 +2472,12 @@ static void __init srso_select_mitigation(void)
setup_force_cpu_cap(X86_FEATURE_SRSO);
x86_return_thunk = srso_return_thunk;
}
- srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ if (has_microcode)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ else
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
} else {
pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
- goto pred_cmd;
}
break;
@@ -2484,7 +2489,6 @@ static void __init srso_select_mitigation(void)
}
} else {
pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
- goto pred_cmd;
}
break;
@@ -2496,20 +2500,12 @@ static void __init srso_select_mitigation(void)
}
} else {
pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
- goto pred_cmd;
}
break;
-
- default:
- break;
}
- pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode"));
-
-pred_cmd:
- if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) &&
- boot_cpu_has(X86_FEATURE_SBPB))
- x86_pred_cmd = PRED_CMD_SBPB;
+out:
+ pr_info("%s\n", srso_strings[srso_mitigation]);
}
#undef pr_fmt
@@ -2715,9 +2711,7 @@ static ssize_t srso_show_state(char *buf)
if (boot_cpu_has(X86_FEATURE_SRSO_NO))
return sysfs_emit(buf, "Mitigation: SMT disabled\n");
- return sysfs_emit(buf, "%s%s\n",
- srso_strings[srso_mitigation],
- (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode"));
+ return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
}
static ssize_t gds_show_state(char *buf)
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 8f86eacf69f7..c131c412db89 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
return i;
}
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
@@ -672,13 +672,13 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
if (c->x86 < 0x17) {
/* LLC is at the node level. */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+ c->topo.llc_id = c->topo.die_id;
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
* LLC is at the core complex level.
* Core complex ID is ApicId[3] for these processors.
*/
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ c->topo.llc_id = c->topo.apicid >> 3;
} else {
/*
* LLC ID is calculated from the number of threads sharing the
@@ -694,12 +694,12 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
if (num_sharing_cache) {
int bits = get_count_order(num_sharing_cache);
- per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+ c->topo.llc_id = c->topo.apicid >> bits;
}
}
}
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
@@ -712,7 +712,7 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
* LLC is at the core complex level.
* Core complex ID is ApicId[3] for these processors.
*/
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ c->topo.llc_id = c->topo.apicid >> 3;
}
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
@@ -740,9 +740,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_SMP
- unsigned int cpu = c->cpu_index;
-#endif
if (c->cpuid_level > 3) {
static int is_initialized;
@@ -776,13 +773,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
new_l2 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid & ~((1 << index_msb) - 1);
+ l2_id = c->topo.apicid & ~((1 << index_msb) - 1);
break;
case 3:
new_l3 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- l3_id = c->apicid & ~((1 << index_msb) - 1);
+ l3_id = c->topo.apicid & ~((1 << index_msb) - 1);
break;
default:
break;
@@ -856,30 +853,24 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (new_l2) {
l2 = new_l2;
-#ifdef CONFIG_SMP
- per_cpu(cpu_llc_id, cpu) = l2_id;
- per_cpu(cpu_l2c_id, cpu) = l2_id;
-#endif
+ c->topo.llc_id = l2_id;
+ c->topo.l2c_id = l2_id;
}
if (new_l3) {
l3 = new_l3;
-#ifdef CONFIG_SMP
- per_cpu(cpu_llc_id, cpu) = l3_id;
-#endif
+ c->topo.llc_id = l3_id;
}
-#ifdef CONFIG_SMP
/*
- * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+ * If llc_id is not yet set, this means cpuid_level < 4 which in
* turns means that the only possibility is SMT (as indicated in
* cpuid1). Since cpuid2 doesn't specify shared caches, and we know
* that SMT shares all caches, we can unconditionally set cpu_llc_id to
- * c->phys_proc_id.
+ * c->topo.pkg_id.
*/
- if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-#endif
+ if (c->topo.llc_id == BAD_APICID)
+ c->topo.llc_id = c->topo.pkg_id;
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
@@ -915,7 +906,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
unsigned int apicid, nshared, first, last;
nshared = base->eax.split.num_threads_sharing + 1;
- apicid = cpu_data(cpu).apicid;
+ apicid = cpu_data(cpu).topo.apicid;
first = apicid - (apicid % nshared);
last = first + nshared - 1;
@@ -924,14 +915,14 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
if (!this_cpu_ci->info_list)
continue;
- apicid = cpu_data(i).apicid;
+ apicid = cpu_data(i).topo.apicid;
if ((apicid < first) || (apicid > last))
continue;
this_leaf = this_cpu_ci->info_list + index;
for_each_online_cpu(sibling) {
- apicid = cpu_data(sibling).apicid;
+ apicid = cpu_data(sibling).topo.apicid;
if ((apicid < first) || (apicid > last))
continue;
cpumask_set_cpu(sibling,
@@ -969,7 +960,7 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
index_msb = get_count_order(num_threads_sharing);
for_each_online_cpu(i)
- if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+ if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) {
struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
if (i == cpu || !sib_cpu_ci->info_list)
@@ -1024,7 +1015,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- id4_regs->id = c->apicid >> index_msb;
+ id4_regs->id = c->topo.apicid >> index_msb;
}
int populate_cache_leaves(unsigned int cpu)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4cc0ab0dfbb5..8f367d376520 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -62,9 +62,11 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/uv/uv.h>
+#include <asm/ia32.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/sev.h>
+#include <asm/tdx.h>
#include "cpu.h"
@@ -74,18 +76,6 @@ u32 elf_hwcap2 __read_mostly;
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
-
-u16 get_llc_id(unsigned int cpu)
-{
- return per_cpu(cpu_llc_id, cpu);
-}
-EXPORT_SYMBOL_GPL(get_llc_id);
-
-/* L2 cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
-
static struct ppin_info {
int feature;
int msr_ppin_ctl;
@@ -199,45 +189,37 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
- /* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
- /* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
-
- [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
@@ -914,7 +896,7 @@ void detect_ht(struct cpuinfo_x86 *c)
return;
index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+ c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb);
smp_num_siblings = smp_num_siblings / c->x86_max_cores;
@@ -922,8 +904,8 @@ void detect_ht(struct cpuinfo_x86 *c)
core_bits = get_count_order(c->x86_max_cores);
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
+ c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
#endif
}
@@ -1114,18 +1096,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
+ bool vp_bits_from_cpuid = true;
- if (c->extended_cpuid_level >= 0x80000008) {
+ if (!cpu_has(c, X86_FEATURE_CPUID) ||
+ (c->extended_cpuid_level < 0x80000008))
+ vp_bits_from_cpuid = false;
+
+ if (vp_bits_from_cpuid) {
cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
+ } else {
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+ } else {
+ c->x86_clflush_size = 32;
+ c->x86_virt_bits = 32;
+ c->x86_phys_bits = 32;
+
+ if (cpu_has(c, X86_FEATURE_PAE) ||
+ cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+ }
}
-#ifdef CONFIG_X86_32
- else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
- c->x86_phys_bits = 36;
-#endif
c->x86_cache_bits = c->x86_phys_bits;
+ c->x86_cache_alignment = c->x86_clflush_size;
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -1303,7 +1301,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
- VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
VULNBL_AMD(0x19, SRSO),
{}
};
@@ -1579,17 +1577,6 @@ static void __init cpu_parse_early_param(void)
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- c->x86_clflush_size = 64;
- c->x86_phys_bits = 36;
- c->x86_virt_bits = 48;
-#else
- c->x86_clflush_size = 32;
- c->x86_phys_bits = 32;
- c->x86_virt_bits = 32;
-#endif
- c->x86_cache_alignment = c->x86_clflush_size;
-
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
@@ -1601,7 +1588,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
get_cpu_cap(c);
- get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
cpu_parse_early_param();
@@ -1617,6 +1603,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_CPUID);
}
+ get_cpu_address_sizes(c);
+
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
cpu_set_bug_bits(c);
@@ -1761,15 +1749,15 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_cpu_address_sizes(c);
if (c->cpuid_level >= 0x00000001) {
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+ c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
# ifdef CONFIG_SMP
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
# else
- c->apicid = c->initial_apicid;
+ c->topo.apicid = c->topo.initial_apicid;
# endif
#endif
- c->phys_proc_id = c->initial_apicid;
+ c->topo.pkg_id = c->topo.initial_apicid;
}
get_model_name(c); /* Default name */
@@ -1799,18 +1787,19 @@ static void generic_identify(struct cpuinfo_x86 *c)
static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- unsigned int apicid, cpu = smp_processor_id();
+ unsigned int cpu = smp_processor_id();
+ u32 apicid;
apicid = apic->cpu_present_to_apicid(cpu);
- if (apicid != c->apicid) {
+ if (apicid != c->topo.apicid) {
pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
- cpu, apicid, c->initial_apicid);
+ cpu, apicid, c->topo.initial_apicid);
}
- BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
- BUG_ON(topology_update_die_map(c->cpu_die_id, cpu));
+ BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu));
+ BUG_ON(topology_update_die_map(c->topo.die_id, cpu));
#else
- c->logical_proc_id = 0;
+ c->topo.logical_pkg_id = 0;
#endif
}
@@ -1829,7 +1818,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_max_cores = 1;
c->x86_coreid_bits = 0;
- c->cu_id = 0xff;
+ c->topo.cu_id = 0xff;
+ c->topo.llc_id = BAD_APICID;
+ c->topo.l2c_id = BAD_APICID;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1855,9 +1846,16 @@ static void identify_cpu(struct cpuinfo_x86 *c)
apply_forced_caps(c);
#ifdef CONFIG_X86_64
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
#endif
+
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
@@ -1989,6 +1987,7 @@ static __init void identify_boot_cpu(void)
setup_cr_pinning();
tsx_init();
+ tdx_init();
lkgs_init();
}
@@ -2075,24 +2074,24 @@ void syscall_init(void)
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
-#ifdef CONFIG_IA32_EMULATION
- wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
- /*
- * This only works on Intel CPUs.
- * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
- * This does not cause SYSENTER to jump to the wrong location, because
- * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
- */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
- (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
-#else
- wrmsrl_cstar((unsigned long)ignore_sysret);
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
-#endif
+ if (ia32_enabled()) {
+ wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ } else {
+ wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ }
/*
* Flags to clear on syscall; clear as much as possible
@@ -2167,8 +2166,6 @@ static inline void setup_getcpu(int cpu)
}
#ifdef CONFIG_X86_64
-static inline void ucode_cpu_init(int cpu) { }
-
static inline void tss_setup_ist(struct tss_struct *tss)
{
/* Set up the per-CPU TSS IST stacks */
@@ -2179,16 +2176,8 @@ static inline void tss_setup_ist(struct tss_struct *tss)
/* Only mapped when SEV-ES is active */
tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
}
-
#else /* CONFIG_X86_64 */
-
-static inline void ucode_cpu_init(int cpu)
-{
- show_ucode_info_early();
-}
-
static inline void tss_setup_ist(struct tss_struct *tss) { }
-
#endif /* !CONFIG_X86_64 */
static inline void tss_setup_io_bitmap(struct tss_struct *tss)
@@ -2244,8 +2233,6 @@ void cpu_init(void)
struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
- ucode_cpu_init(cpu);
-
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1dcd7d4e38ef..885281ae79a5 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -78,6 +78,9 @@ extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
+
unsigned int aperfmperf_get_khz(int cpu);
void cpu_select_mitigations(void);
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
new file mode 100644
index 000000000000..0c179d684b3b
--- /dev/null
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+static int cpu_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+ struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu);
+
+ seq_printf(m, "online: %d\n", cpu_online(cpu));
+ if (!c->initialized)
+ return 0;
+
+ seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: %x\n", c->topo.apicid);
+ seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
+ seq_printf(m, "die_id: %u\n", c->topo.die_id);
+ seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
+ seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
+ seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
+ seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
+ seq_printf(m, "max_cores: %u\n", c->x86_max_cores);
+ seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package);
+ seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings);
+ return 0;
+}
+
+static int cpu_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cpu_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_cpu_ops = {
+ .open = cpu_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int cpu_init_debugfs(void)
+{
+ struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
+ unsigned long id;
+ char name[24];
+
+ dir = debugfs_create_dir("cpus", base);
+ for_each_possible_cpu(id) {
+ sprintf(name, "%lu", id);
+ debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops);
+ }
+ return 0;
+}
+late_initcall(cpu_init_debugfs);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index defdc594be14..f0cd95502faa 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -63,8 +63,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c)
*/
static void hygon_get_topology(struct cpuinfo_x86 *c)
{
- int cpu = smp_processor_id();
-
/* get information required for multi-node processors */
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
int err;
@@ -72,9 +70,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- c->cpu_die_id = ecx & 0xff;
+ c->topo.die_id = ecx & 0xff;
- c->cpu_core_id = ebx & 0xff;
+ c->topo.core_id = ebx & 0xff;
if (smp_num_siblings > 1)
c->x86_max_cores /= smp_num_siblings;
@@ -87,17 +85,20 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
if (!err)
c->x86_coreid_bits = get_count_order(c->x86_max_cores);
- /* Socket ID is ApicId[6] for these processors. */
- c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT;
+ /*
+ * Socket ID is ApicId[6] for the processors with model <= 0x3
+ * when running on host.
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3)
+ c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT;
- cacheinfo_hygon_init_llc_id(c, cpu);
+ cacheinfo_hygon_init_llc_id(c);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- c->cpu_die_id = value & 7;
-
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+ c->topo.die_id = value & 7;
+ c->topo.llc_id = c->topo.die_id;
} else
return;
@@ -112,15 +113,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
static void hygon_detect_cmp(struct cpuinfo_x86 *c)
{
unsigned int bits;
- int cpu = smp_processor_id();
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1);
/* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
- /* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
+ c->topo.pkg_id = c->topo.initial_apicid >> bits;
+ /* Use package ID also for last level cache */
+ c->topo.llc_id = c->topo.die_id = c->topo.pkg_id;
}
static void srat_detect_node(struct cpuinfo_x86 *c)
@@ -128,11 +128,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
- unsigned int apicid = c->apicid;
+ unsigned int apicid = c->topo.apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = per_cpu(cpu_llc_id, cpu);
+ node = c->topo.llc_id;
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -161,7 +161,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
- int ht_nodeid = c->initial_apicid;
+ int ht_nodeid = c->topo.initial_apicid;
if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
@@ -290,6 +290,8 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
static void init_hygon(struct cpuinfo_x86 *c)
{
+ u64 vm_cr;
+
early_init_hygon(c);
/*
@@ -301,7 +303,7 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* get apicid instead of initial apic id from cpuid */
- c->apicid = read_apic_id();
+ c->topo.apicid = read_apic_id();
/*
* XXX someone from Hygon needs to confirm this DTRT
@@ -320,6 +322,14 @@ static void init_hygon(struct cpuinfo_x86 *c)
init_hygon_cacheinfo(c);
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
if (cpu_has(c, X86_FEATURE_XMM2)) {
/*
* Use LFENCE for execution serialization. On families which
@@ -344,6 +354,9 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index be4045628fd3..a927a8fc9624 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -314,19 +314,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
- if (c->cpuid_level >= 0x00000001) {
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- /*
- * If HTT (EDX[28]) is set EBX[16:23] contain the number of
- * apicids which are reserved per package. Store the resulting
- * shift value for the package management code.
- */
- if (edx & (1U << 28))
- c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
- }
-
check_memory_type_self_snoop_errata(c);
/*
@@ -1016,7 +1003,6 @@ static struct ctl_table sld_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static int __init sld_mitigate_sysctl_init(void)
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index e4c3ba91321c..f18d35fe27a9 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -237,4 +237,4 @@ err_out_online:
cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE);
return ret;
}
-subsys_initcall(intel_epb_init);
+late_initcall(intel_epb_init);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index c267f43de39e..2b46eb0fdf3a 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -87,42 +87,40 @@ struct smca_bank {
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
-struct smca_bank_name {
- const char *name; /* Short name for sysfs */
- const char *long_name; /* Long name for pretty-printing */
-};
-
-static struct smca_bank_name smca_names[] = {
- [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
- [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
- [SMCA_DE] = { "decode_unit", "Decode Unit" },
- [SMCA_RESERVED] = { "reserved", "Reserved" },
- [SMCA_EX] = { "execution_unit", "Execution Unit" },
- [SMCA_FP] = { "floating_point", "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
- [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
- [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+static const char * const smca_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [SMCA_RESERVED] = "reserved",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+ [SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
+ [SMCA_PIE] = "pie",
/* UMC v2 is separate because both of them can exist in a single system. */
- [SMCA_UMC] = { "umc", "Unified Memory Controller" },
- [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
- [SMCA_PB] = { "param_block", "Parameter Block" },
- [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
- [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
- [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
- [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
- [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
- [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
- [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
- [SMCA_NBIF] = { "nbif", "NBIF Unit" },
- [SMCA_SHUB] = { "shub", "System Hub Unit" },
- [SMCA_SATA] = { "sata", "SATA Unit" },
- [SMCA_USB] = { "usb", "USB Unit" },
- [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
- [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
- [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
- [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
+ [SMCA_UMC] = "umc",
+ [SMCA_UMC_V2] = "umc_v2",
+ [SMCA_MA_LLC] = "ma_llc",
+ [SMCA_PB] = "param_block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "psp",
+ [SMCA_SMU ... SMCA_SMU_V2] = "smu",
+ [SMCA_MP5] = "mp5",
+ [SMCA_MPDMA] = "mpdma",
+ [SMCA_NBIO] = "nbio",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
+ [SMCA_XGMI_PCS] = "xgmi_pcs",
+ [SMCA_NBIF] = "nbif",
+ [SMCA_SHUB] = "shub",
+ [SMCA_SATA] = "sata",
+ [SMCA_USB] = "usb",
+ [SMCA_USR_DP] = "usr_dp",
+ [SMCA_USR_CP] = "usr_cp",
+ [SMCA_GMI_PCS] = "gmi_pcs",
+ [SMCA_XGMI_PHY] = "xgmi_phy",
+ [SMCA_WAFL_PHY] = "wafl_phy",
+ [SMCA_GMI_PHY] = "gmi_phy",
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -130,17 +128,8 @@ static const char *smca_get_name(enum smca_bank_types t)
if (t >= N_SMCA_BANK_TYPES)
return NULL;
- return smca_names[t].name;
-}
-
-const char *smca_get_long_name(enum smca_bank_types t)
-{
- if (t >= N_SMCA_BANK_TYPES)
- return NULL;
-
- return smca_names[t].long_name;
+ return smca_names[t];
}
-EXPORT_SYMBOL_GPL(smca_get_long_name);
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
@@ -178,6 +167,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
{ SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
@@ -212,6 +202,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
@@ -713,17 +705,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
-bool amd_mce_is_memory_error(struct mce *m)
+/*
+ * DRAM ECC errors are reported in the Northbridge (bank 4) with
+ * Extended Error Code 8.
+ */
+static bool legacy_mce_is_memory_error(struct mce *m)
+{
+ return m->bank == 4 && XEC(m->status, 0x1f) == 8;
+}
+
+/*
+ * DRAM ECC errors are reported in Unified Memory Controllers with
+ * Extended Error Code 0.
+ */
+static bool smca_mce_is_memory_error(struct mce *m)
{
enum smca_bank_types bank_type;
- /* ErrCodeExt[20:16] */
- u8 xec = (m->status >> 16) & 0x1f;
+
+ if (XEC(m->status, 0x3f))
+ return false;
bank_type = smca_get_bank_type(m->extcpu, m->bank);
+
+ return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
+}
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
if (mce_flags.smca)
- return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0;
+ return smca_mce_is_memory_error(m);
+ else
+ return legacy_mce_is_memory_error(m);
+}
+
+/*
+ * AMD systems do not have an explicit indicator that the value in MCA_ADDR is
+ * a system physical address. Therefore, individual cases need to be detected.
+ * Future cases and checks will be added as needed.
+ *
+ * 1) General case
+ * a) Assume address is not usable.
+ * 2) Poison errors
+ * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
+ * northbridge (bank 4).
+ * b) Refers to poison consumption in the core. Does not include "no action",
+ * "action optional", or "deferred" error severities.
+ * c) Will include a usable address so that immediate action can be taken.
+ * 3) Northbridge DRAM ECC errors
+ * a) Reported in legacy bank 4 with extended error code (XEC) 8.
+ * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
+ * this bit should not be checked.
+ *
+ * NOTE: SMCA UMC memory errors fall into case #1.
+ */
+bool amd_mce_usable_address(struct mce *m)
+{
+ /* Check special northbridge case 3) first. */
+ if (!mce_flags.smca) {
+ if (legacy_mce_is_memory_error(m))
+ return true;
+ else if (m->bank == 4)
+ return false;
+ }
+
+ /* Check poison bit for all other bank types. */
+ if (m->status & MCI_STATUS_POISON)
+ return true;
- return m->bank == 4 && xec == 0x8;
+ /* Assume address is not usable for all others. */
+ return false;
}
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 8ed341714686..7f7309ff67d0 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -103,9 +103,9 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
m.socketid = -1;
for_each_possible_cpu(cpu) {
- if (cpu_data(cpu).initial_apicid == lapic_id) {
+ if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
m.extcpu = cpu;
- m.socketid = cpu_data(m.extcpu).phys_proc_id;
+ m.socketid = cpu_data(m.extcpu).topo.pkg_id;
break;
}
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 6f35f724cc14..bc39252bc54f 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -44,6 +44,7 @@
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
+#include <linux/kexec.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -52,6 +53,7 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
+#include <asm/tdx.h>
#include "internal.h"
@@ -123,8 +125,8 @@ void mce_setup(struct mce *m)
m->time = __ktime_get_real_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
- m->socketid = cpu_data(m->extcpu).phys_proc_id;
- m->apicid = cpu_data(m->extcpu).initial_apicid;
+ m->socketid = cpu_data(m->extcpu).topo.pkg_id;
+ m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
m->ppin = cpu_data(m->extcpu).ppin;
m->microcode = boot_cpu_data.microcode;
@@ -228,11 +230,20 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
+static const char *mce_dump_aux_info(struct mce *m)
+{
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return tdx_dump_mce_info(m);
+
+ return NULL;
+}
+
static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
+ const char *memmsg;
/*
* Allow instrumentation around external facilities usage. Not that it
@@ -283,9 +294,29 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
}
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
+
+ memmsg = mce_dump_aux_info(final);
+ if (memmsg)
+ pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->status & MCI_STATUS_ADDRV)) {
+ struct page *p;
+ p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -453,32 +484,22 @@ static void mce_irq_work_cb(struct irq_work *entry)
mce_schedule_work();
}
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granularity for now.
- */
-int mce_usable_address(struct mce *m)
+bool mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_ADDRV))
- return 0;
-
- /* Checks after this one are Intel/Zhaoxin-specific: */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
- boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
- return 1;
-
- if (!(m->status & MCI_STATUS_MISCV))
- return 0;
+ return false;
- if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
- return 0;
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ return amd_mce_usable_address(m);
- if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
- return 0;
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ return intel_mce_usable_address(m);
- return 1;
+ default:
+ return true;
+ }
}
EXPORT_SYMBOL_GPL(mce_usable_address);
@@ -680,6 +701,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
barrier();
m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ /*
+ * Update storm tracking here, before checking for the
+ * MCI_STATUS_VAL bit. Valid corrected errors count
+ * towards declaring, or maintaining, storm status. No
+ * error in a bank counts towards avoiding, or ending,
+ * storm status.
+ */
+ if (!mca_cfg.cmci_disabled)
+ mce_track_storm(&m);
+
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
continue;
@@ -1611,13 +1642,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
- return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-
static void __start_timer(struct timer_list *t, unsigned long interval)
{
unsigned long when = jiffies + interval;
@@ -1647,15 +1671,9 @@ static void mce_timer_fn(struct timer_list *t)
iv = __this_cpu_read(mce_next_interval);
- if (mce_available(this_cpu_ptr(&cpu_info))) {
+ if (mce_available(this_cpu_ptr(&cpu_info)))
mc_poll_banks();
- if (mce_intel_cmci_poll()) {
- iv = mce_adjust_timer(iv);
- goto done;
- }
- }
-
/*
* Alert userspace if needed. If we logged an MCE, reduce the polling
* interval, otherwise increase the polling interval.
@@ -1665,23 +1683,29 @@ static void mce_timer_fn(struct timer_list *t)
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-done:
- __this_cpu_write(mce_next_interval, iv);
- __start_timer(t, iv);
+ if (mce_get_storm_mode()) {
+ __start_timer(t, HZ);
+ } else {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
}
/*
- * Ensure that the timer is firing in @interval from now.
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
*/
-void mce_timer_kick(unsigned long interval)
+void mce_timer_kick(bool storm)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long iv = __this_cpu_read(mce_next_interval);
- __start_timer(t, interval);
+ mce_set_storm_mode(storm);
- if (interval < iv)
- __this_cpu_write(mce_next_interval, interval);
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -2005,7 +2029,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
intel_init_cmci();
intel_init_lmce();
- mce_adjust_timer = cmci_intel_adjust_timer;
}
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
@@ -2018,7 +2041,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD: {
@@ -2578,9 +2600,6 @@ static int mce_device_create(unsigned int cpu)
int err;
int i, j;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
dev = per_cpu(mce_device, cpu);
if (dev)
return 0;
@@ -2675,8 +2694,6 @@ static void mce_reenable_cpu(void)
static int mce_cpu_dead(unsigned int cpu)
{
- mce_intel_hcpu_update(cpu);
-
/* intentionally ignoring frozen here */
if (!cpuhp_tasks_frozen)
cmci_rediscover();
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 4d8d4bcf915d..72f0695c3dc1 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -746,6 +746,7 @@ static void check_hw_inj_possible(void)
wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0);
if (!status) {
hw_injection_possible = false;
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index f5323551c1a9..399b62e223d2 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -42,15 +42,6 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
- * CMCI storm detection backoff counter
- *
- * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
- * encountered an error. If not, we decrement it by one. We signal the end of
- * the CMCI storm when it reaches 0.
- */
-static DEFINE_PER_CPU(int, cmci_backoff_cnt);
-
-/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -63,22 +54,26 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
*/
static DEFINE_SPINLOCK(cmci_poll_lock);
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
#define CMCI_THRESHOLD 1
-#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (HZ)
-#define CMCI_STORM_THRESHOLD 15
-
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
-enum {
- CMCI_STORM_NONE,
- CMCI_STORM_ACTIVE,
- CMCI_STORM_SUBSIDED,
-};
+/*
+ * MCi_CTL2 threshold for each bank when there is no storm.
+ * Default value for each bank may have been set by BIOS.
+ */
+static u16 cmci_threshold[MAX_NR_BANKS];
-static atomic_t cmci_storm_on_cpus;
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when some asks "Why am I not seeing all corrected errors?"
+ * A high threshold is used instead of just disabling CMCI for a
+ * bank because both corrected and uncorrected errors may be logged
+ * in the same bank and signalled with CMCI. The threshold only applies
+ * to corrected errors, so keeping CMCI enabled means that uncorrected
+ * errors will still be processed in a timely fashion.
+ */
+#define CMCI_STORM_THRESHOLD 32749
static int cmci_supported(int *banks)
{
@@ -134,204 +129,166 @@ static bool lmce_supported(void)
return tmp & FEAT_CTL_LMCE_ENABLED;
}
-bool mce_intel_cmci_poll(void)
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
{
- if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return false;
-
- /*
- * Reset the counter if we've logged an error in the last poll
- * during the storm.
- */
- if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
- else
- this_cpu_dec(cmci_backoff_cnt);
+ unsigned long flags;
+ u64 val;
- return true;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-void mce_intel_hcpu_update(unsigned long cpu)
+void mce_intel_handle_storm(int bank, bool on)
{
- if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
- atomic_dec(&cmci_storm_on_cpus);
+ if (on)
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ else
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+}
- per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}
-static void cmci_toggle_interrupt_mode(bool on)
+/*
+ * Check all the reasons why current CPU cannot claim
+ * ownership of a bank.
+ * 1: CPU already owns this bank
+ * 2: BIOS owns this bank
+ * 3: Some other CPU owns this bank
+ */
+static bool cmci_skip_bank(int bank, u64 *val)
{
- unsigned long flags, *owned;
- int bank;
- u64 val;
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ if (test_bit(bank, owned))
+ return true;
- if (on)
- val |= MCI_CTL2_CMCI_EN;
- else
- val &= ~MCI_CTL2_CMCI_EN;
+ /* Skip banks in firmware first mode */
+ if (test_bit(bank, mce_banks_ce_disabled))
+ return true;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), *val);
-unsigned long cmci_intel_adjust_timer(unsigned long interval)
-{
- if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
- (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
- mce_notify_irq();
- return CMCI_STORM_INTERVAL;
+ /* Already owned by someone else? */
+ if (*val & MCI_CTL2_CMCI_EN) {
+ clear_bit(bank, owned);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ return true;
}
- switch (__this_cpu_read(cmci_storm_state)) {
- case CMCI_STORM_ACTIVE:
-
- /*
- * We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the timer
- * interval is back to our poll interval.
- */
- __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- if (!atomic_sub_return(1, &cmci_storm_on_cpus))
- pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+ return false;
+}
- fallthrough;
+/*
+ * Decide which CMCI interrupt threshold to use:
+ * 1: If this bank is in storm mode from whichever CPU was
+ * the previous owner, stay in storm mode.
+ * 2: If ignoring any threshold set by BIOS, set Linux default
+ * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
+ */
+static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
+{
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ return val;
- case CMCI_STORM_SUBSIDED:
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
- * We wait for all CPUs to go back to SUBSIDED state. When that
- * happens we switch back to interrupt mode.
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
*/
- if (!atomic_read(&cmci_storm_on_cpus)) {
- __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_toggle_interrupt_mode(true);
- cmci_recheck();
- }
- return CMCI_POLL_INTERVAL;
- default:
-
- /* We have shiny weather. Let the poll do whatever it thinks. */
- return interval;
+ *bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
}
+
+ return val;
}
-static bool cmci_storm_detect(void)
+/*
+ * Try to claim ownership of a bank.
+ */
+static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
{
- unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
- unsigned long ts = __this_cpu_read(cmci_time_stamp);
- unsigned long now = jiffies;
- int r;
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
- if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
- return true;
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
- if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
- cnt++;
- } else {
- cnt = 1;
- __this_cpu_write(cmci_time_stamp, now);
+ /* If the enable bit did not stick, this bank should be polled. */
+ if (!(val & MCI_CTL2_CMCI_EN)) {
+ WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
+ storm->banks[bank].poll_only = true;
+ return;
}
- __this_cpu_write(cmci_storm_cnt, cnt);
-
- if (cnt <= CMCI_STORM_THRESHOLD)
- return false;
- cmci_toggle_interrupt_mode(false);
- __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
- r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_STORM_INTERVAL);
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ /* This CPU successfully set the enable bit. */
+ set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
- if (r == 1)
- pr_notice("CMCI storm detected: switching to poll mode\n");
- return true;
-}
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
+ mce_inherit_storm(bank);
+ cmci_storm_begin(bank);
+ } else {
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ }
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
- if (cmci_storm_detect())
- return;
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ *bios_wrong_thresh = 1;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ /* Save default threshold for each bank */
+ if (cmci_threshold[bank] == 0)
+ cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
+ * banks. Called during initial bootstrap, and also for hotplug CPU operations
+ * to rediscover/reassign machine check banks.
*/
static void cmci_discover(int banks)
{
- unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+ int bios_wrong_thresh = 0;
unsigned long flags;
int i;
- int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
- if (test_bit(i, owned))
- continue;
-
- /* Skip banks in firmware first mode */
- if (test_bit(i, mce_banks_ce_disabled))
+ if (cmci_skip_bank(i, &val))
continue;
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Already owned by someone else? */
- if (val & MCI_CTL2_CMCI_EN) {
- clear_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- continue;
- }
-
- if (!mca_cfg.bios_cmci_threshold) {
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= CMCI_THRESHOLD;
- } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
- /*
- * If bios_cmci_threshold boot option was specified
- * but the threshold is zero, we'll try to initialize
- * it to 1.
- */
- bios_zero_thresh = 1;
- val |= CMCI_THRESHOLD;
- }
-
- val |= MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Did the enable bit stick? -- the bank supports CMCI */
- if (val & MCI_CTL2_CMCI_EN) {
- set_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- /*
- * We are able to set thresholds for some banks that
- * had a threshold of 0. This means the BIOS has not
- * set the thresholds properly or does not work with
- * this boot option. Note down now and report later.
- */
- if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
- (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
- bios_wrong_thresh = 1;
- } else {
- WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
- }
+ val = cmci_pick_threshold(val, &bios_zero_thresh);
+ cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
@@ -370,6 +327,9 @@ static void __cmci_disable_bank(int bank)
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
}
/*
@@ -536,3 +496,23 @@ bool intel_filter_mce(struct mce *m)
return false;
}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+bool intel_mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV))
+ return false;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return false;
+
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index bcf1b3c66c9c..01f8f03969e6 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -41,26 +41,80 @@ struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long cmci_intel_adjust_timer(unsigned long interval);
-bool mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
+void mce_intel_handle_storm(int bank, bool on);
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
void intel_clear_lmce(void);
bool intel_filter_mce(struct mce *m);
+bool intel_mce_usable_address(struct mce *m);
#else
-# define cmci_intel_adjust_timer mce_adjust_timer_default
-static inline bool mce_intel_cmci_poll(void) { return false; }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void mce_intel_handle_storm(int bank, bool on) { }
static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
static inline bool intel_filter_mce(struct mce *m) { return false; }
+static inline bool intel_mce_usable_address(struct mce *m) { return false; }
#endif
-void mce_timer_kick(unsigned long interval);
+void mce_timer_kick(bool storm);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void cmci_storm_begin(unsigned int bank);
+void cmci_storm_end(unsigned int bank);
+void mce_track_storm(struct mce *mce);
+void mce_inherit_storm(unsigned int bank);
+bool mce_get_storm_mode(void);
+void mce_set_storm_mode(bool storm);
+#else
+static inline void cmci_storm_begin(unsigned int bank) {}
+static inline void cmci_storm_end(unsigned int bank) {}
+static inline void mce_track_storm(struct mce *mce) {}
+static inline void mce_inherit_storm(unsigned int bank) {}
+static inline bool mce_get_storm_mode(void) { return false; }
+static inline void mce_set_storm_mode(bool storm) {}
+#endif
+
+/*
+ * history: Bitmask tracking errors occurrence. Each set bit
+ * represents an error seen.
+ *
+ * timestamp: Last time (in jiffies) that the bank was polled.
+ * in_storm_mode: Is this bank in storm mode?
+ * poll_only: Bank does not support CMCI, skip storm tracking.
+ */
+struct storm_bank {
+ u64 history;
+ u64 timestamp;
+ bool in_storm_mode;
+ bool poll_only;
+};
+
+#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
+
+/* How many errors within the history buffer mark the start of a storm. */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of machine check bank without an error before declaring
+ * the storm is over. Since it is tracked by the bitmasks in the history
+ * field of struct storm_bank the mask is 30 bits [0 ... 29].
+ */
+#define STORM_END_POLL_THRESHOLD 29
+
+/*
+ * banks: per-cpu, per-bank details
+ * stormy_bank_count: count of MC banks in storm state
+ * poll_mode: CPU is in poll mode
+ */
+struct mca_storm_desc {
+ struct storm_bank banks[MAX_NR_BANKS];
+ u8 stormy_bank_count;
+ bool poll_mode;
+};
+
+DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
@@ -210,6 +264,7 @@ extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
+bool amd_mce_usable_address(struct mce *m);
/*
* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
@@ -237,6 +292,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m)
#else
static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline bool amd_mce_usable_address(struct mce *m) { return false; }
static inline void smca_extract_err_addr(struct mce *m) { }
#endif
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index ef4e7bb5fd88..89e31e1e5c9c 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
apic_eoi();
}
+
+DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+void mce_inherit_storm(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ /*
+ * Previous CPU owning this bank had put it into storm mode,
+ * but the precise history of that storm is unknown. Assume
+ * the worst (all recent polls of the bank found a valid error
+ * logged). This will avoid the new owner prematurely declaring
+ * the storm has ended.
+ */
+ storm->banks[bank].history = ~0ull;
+ storm->banks[bank].timestamp = jiffies;
+}
+
+bool mce_get_storm_mode(void)
+{
+ return __this_cpu_read(storm_desc.poll_mode);
+}
+
+void mce_set_storm_mode(bool storm)
+{
+ __this_cpu_write(storm_desc.poll_mode, storm);
+}
+
+static void mce_handle_storm(unsigned int bank, bool on)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_handle_storm(bank, on);
+ break;
+ }
+}
+
+void cmci_storm_begin(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].in_storm_mode = true;
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode
+ * start polling.
+ */
+ if (++storm->stormy_bank_count == 1)
+ mce_timer_kick(true);
+}
+
+void cmci_storm_end(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].history = 0;
+ storm->banks[bank].in_storm_mode = false;
+
+ /* If no banks left in storm mode, stop polling. */
+ if (!this_cpu_dec_return(storm_desc.stormy_bank_count))
+ mce_timer_kick(false);
+}
+
+void mce_track_storm(struct mce *mce)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history = 0;
+
+ /* No tracking needed for banks that do not support CMCI */
+ if (storm->banks[mce->bank].poll_only)
+ return;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask will record about the last minute of poll results.
+ * If it is not in storm mode, then the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to history.
+ */
+ if (!storm->banks[mce->bank].in_storm_mode) {
+ delta = now - storm->banks[mce->bank].timestamp;
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history. */
+ if (shift < NUM_HISTORY_BITS)
+ history = storm->banks[mce->bank].history << shift;
+
+ storm->banks[mce->bank].timestamp = now;
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
+ history |= 1;
+
+ storm->banks[mce->bank].history = history;
+
+ if (storm->banks[mce->bank].in_storm_mode) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, false);
+ cmci_storm_end(mce->bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, true);
+ cmci_storm_begin(mce->bank);
+ }
+}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bbd1dc38ea03..13b45b9c806d 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -37,6 +37,16 @@
#include "internal.h"
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ unsigned int size;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(microcode_cache);
+
#define UCODE_MAGIC 0x00414d44
#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
#define UCODE_UCODE_TYPE 0x00000001
@@ -94,8 +104,6 @@ struct cont_desc {
size_t size;
};
-static u32 ucode_new_rev;
-
/*
* Microcode patch container file is prepended to the initrd in cpio
* format. See Documentation/arch/x86/microcode.rst
@@ -121,24 +129,20 @@ static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
/*
* Check whether there is a valid microcode container file at the beginning
- * of @buf of size @buf_size. Set @early to use this function in the early path.
+ * of @buf of size @buf_size.
*/
-static bool verify_container(const u8 *buf, size_t buf_size, bool early)
+static bool verify_container(const u8 *buf, size_t buf_size)
{
u32 cont_magic;
if (buf_size <= CONTAINER_HDR_SZ) {
- if (!early)
- pr_debug("Truncated microcode container header.\n");
-
+ pr_debug("Truncated microcode container header.\n");
return false;
}
cont_magic = *(const u32 *)buf;
if (cont_magic != UCODE_MAGIC) {
- if (!early)
- pr_debug("Invalid magic value (0x%08x).\n", cont_magic);
-
+ pr_debug("Invalid magic value (0x%08x).\n", cont_magic);
return false;
}
@@ -147,23 +151,20 @@ static bool verify_container(const u8 *buf, size_t buf_size, bool early)
/*
* Check whether there is a valid, non-truncated CPU equivalence table at the
- * beginning of @buf of size @buf_size. Set @early to use this function in the
- * early path.
+ * beginning of @buf of size @buf_size.
*/
-static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
+static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
{
const u32 *hdr = (const u32 *)buf;
u32 cont_type, equiv_tbl_len;
- if (!verify_container(buf, buf_size, early))
+ if (!verify_container(buf, buf_size))
return false;
cont_type = hdr[1];
if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
- if (!early)
- pr_debug("Wrong microcode container equivalence table type: %u.\n",
- cont_type);
-
+ pr_debug("Wrong microcode container equivalence table type: %u.\n",
+ cont_type);
return false;
}
@@ -172,9 +173,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
equiv_tbl_len = hdr[2];
if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) ||
buf_size < equiv_tbl_len) {
- if (!early)
- pr_debug("Truncated equivalence table.\n");
-
+ pr_debug("Truncated equivalence table.\n");
return false;
}
@@ -183,22 +182,19 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
/*
* Check whether there is a valid, non-truncated microcode patch section at the
- * beginning of @buf of size @buf_size. Set @early to use this function in the
- * early path.
+ * beginning of @buf of size @buf_size.
*
* On success, @sh_psize returns the patch size according to the section header,
* to the caller.
*/
static bool
-__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early)
+__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
{
u32 p_type, p_size;
const u32 *hdr;
if (buf_size < SECTION_HDR_SIZE) {
- if (!early)
- pr_debug("Truncated patch section.\n");
-
+ pr_debug("Truncated patch section.\n");
return false;
}
@@ -207,17 +203,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early
p_size = hdr[1];
if (p_type != UCODE_UCODE_TYPE) {
- if (!early)
- pr_debug("Invalid type field (0x%x) in container file section header.\n",
- p_type);
-
+ pr_debug("Invalid type field (0x%x) in container file section header.\n",
+ p_type);
return false;
}
if (p_size < sizeof(struct microcode_header_amd)) {
- if (!early)
- pr_debug("Patch of size %u too short.\n", p_size);
-
+ pr_debug("Patch of size %u too short.\n", p_size);
return false;
}
@@ -269,7 +261,7 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
* 0: success
*/
static int
-verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool early)
+verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
{
struct microcode_header_amd *mc_hdr;
unsigned int ret;
@@ -277,7 +269,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
u16 proc_id;
u8 patch_fam;
- if (!__verify_patch_section(buf, buf_size, &sh_psize, early))
+ if (!__verify_patch_section(buf, buf_size, &sh_psize))
return -1;
/*
@@ -292,16 +284,13 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
* size sh_psize, as the section claims.
*/
if (buf_size < sh_psize) {
- if (!early)
- pr_debug("Patch of size %u truncated.\n", sh_psize);
-
+ pr_debug("Patch of size %u truncated.\n", sh_psize);
return -1;
}
ret = __verify_patch_size(family, sh_psize, buf_size);
if (!ret) {
- if (!early)
- pr_debug("Per-family patch size mismatch.\n");
+ pr_debug("Per-family patch size mismatch.\n");
return -1;
}
@@ -309,8 +298,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE);
if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
- if (!early)
- pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
return -1;
}
@@ -337,7 +325,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u16 eq_id;
u8 *buf;
- if (!verify_equivalence_table(ucode, size, true))
+ if (!verify_equivalence_table(ucode, size))
return 0;
buf = ucode;
@@ -364,7 +352,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u32 patch_size;
int ret;
- ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true);
+ ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size);
if (ret < 0) {
/*
* Patch verification failed, skip to the next container, if
@@ -452,19 +440,12 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
*
* Returns true if container found (sets @desc), false otherwise.
*/
-static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size)
+static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size)
{
struct cont_desc desc = { 0 };
struct microcode_amd *mc;
- u32 rev, dummy, *new_rev;
bool ret = false;
-#ifdef CONFIG_X86_32
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
-#else
- new_rev = &ucode_new_rev;
-#endif
-
desc.cpuid_1_eax = cpuid_1_eax;
scan_containers(ucode, size, &desc);
@@ -473,22 +454,15 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size)
if (!mc)
return ret;
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
/*
* Allow application of the same revision to pick up SMT-specific
* changes even if the revision of the other SMT thread is already
* up-to-date.
*/
- if (rev > mc->hdr.patch_id)
+ if (old_rev > mc->hdr.patch_id)
return ret;
- if (!__apply_microcode_amd(mc)) {
- *new_rev = mc->hdr.patch_id;
- ret = true;
- }
-
- return ret;
+ return !__apply_microcode_amd(mc);
}
static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
@@ -501,7 +475,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
if (family >= 0x15)
snprintf(fw_name, sizeof(fw_name),
- "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+ "amd-ucode/microcode_amd_fam%02hhxh.bin", family);
if (firmware_request_builtin(&fw, fw_name)) {
cp->size = fw.size;
@@ -512,57 +486,48 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
return false;
}
-static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
- struct ucode_cpu_info *uci;
struct cpio_data cp;
- const char *path;
- bool use_pa;
-
- if (IS_ENABLED(CONFIG_X86_32)) {
- uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
- path = (const char *)__pa_nodebug(ucode_path);
- use_pa = true;
- } else {
- uci = ucode_cpu_info;
- path = ucode_path;
- use_pa = false;
- }
if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
- cp = find_microcode_in_initrd(path, use_pa);
-
- /* Needed in load_microcode_amd() */
- uci->cpu_sig.sig = cpuid_1_eax;
+ cp = find_microcode_in_initrd(ucode_path);
*ret = cp;
}
-static void apply_ucode_from_containers(unsigned int cpuid_1_eax)
+void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
{
struct cpio_data cp = { };
+ u32 dummy;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy);
+
+ /* Needed in load_microcode_amd() */
+ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
find_blobs_in_containers(cpuid_1_eax, &cp);
if (!(cp.data && cp.size))
return;
- early_apply_microcode(cpuid_1_eax, cp.data, cp.size);
-}
-
-void load_ucode_amd_early(unsigned int cpuid_1_eax)
-{
- return apply_ucode_from_containers(cpuid_1_eax);
+ if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size))
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy);
}
static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
-int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
+static int __init save_microcode_in_initrd(void)
{
+ unsigned int cpuid_1_eax = native_cpuid_eax(1);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
struct cont_desc desc = { 0 };
enum ucode_state ret;
struct cpio_data cp;
- cp = find_microcode_in_initrd(ucode_path, false);
+ if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
+
+ find_blobs_in_containers(cpuid_1_eax, &cp);
if (!(cp.data && cp.size))
return -EINVAL;
@@ -578,6 +543,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
return 0;
}
+early_initcall(save_microcode_in_initrd);
/*
* a small, trivial cache of per-family ucode patches
@@ -631,7 +597,6 @@ static struct ucode_patch *find_patch(unsigned int cpu)
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u16 equiv_id;
-
equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
if (!equiv_id)
return NULL;
@@ -654,10 +619,8 @@ void reload_ucode_amd(unsigned int cpu)
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
if (rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- ucode_new_rev = mc->hdr.patch_id;
- pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
- }
+ if (!__apply_microcode_amd(mc))
+ pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id);
}
}
@@ -678,8 +641,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
if (p && (p->patch_id == csig->rev))
uci->mc = p->data;
- pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
-
return 0;
}
@@ -720,8 +681,6 @@ static enum ucode_state apply_microcode_amd(int cpu)
rev = mc_amd->hdr.patch_id;
ret = UCODE_UPDATED;
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
-
out:
uci->cpu_sig.rev = rev;
c->microcode = rev;
@@ -733,12 +692,20 @@ out:
return ret;
}
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+{
+ unsigned int cpu = smp_processor_id();
+
+ ucode_cpu_info[cpu].cpu_sig.sig = cpuid_1_eax;
+ apply_microcode_amd(cpu);
+}
+
static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
{
u32 equiv_tbl_len;
const u32 *hdr;
- if (!verify_equivalence_table(buf, buf_size, false))
+ if (!verify_equivalence_table(buf, buf_size))
return 0;
hdr = (const u32 *)buf;
@@ -784,7 +751,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
u16 proc_id;
int ret;
- ret = verify_patch(family, fw, leftover, patch_size, false);
+ ret = verify_patch(family, fw, leftover, patch_size);
if (ret)
return ret;
@@ -909,6 +876,9 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
+ if (force_minrev)
+ return UCODE_NFOUND;
+
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -918,7 +888,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
}
ret = UCODE_ERROR;
- if (!verify_container(fw->data, fw->size, false))
+ if (!verify_container(fw->data, fw->size))
goto fw_release;
ret = load_microcode_amd(c->x86, fw->data, fw->size);
@@ -938,10 +908,11 @@ static void microcode_fini_cpu_amd(int cpu)
}
static struct microcode_ops microcode_amd_ops = {
- .request_microcode_fw = request_microcode_amd,
- .collect_cpu_info = collect_cpu_info_amd,
- .apply_microcode = apply_microcode_amd,
- .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .request_microcode_fw = request_microcode_amd,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .nmi_safe = true,
};
struct microcode_ops * __init init_amd_microcode(void)
@@ -952,11 +923,6 @@ struct microcode_ops * __init init_amd_microcode(void)
pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
return NULL;
}
-
- if (ucode_new_rev)
- pr_info_once("microcode updated early to new patch_level=0x%08x\n",
- ucode_new_rev);
-
return &microcode_amd_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6cc7a2c181da..232026a239a6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -23,6 +23,7 @@
#include <linux/miscdevice.h>
#include <linux/capability.h>
#include <linux/firmware.h>
+#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/mutex.h>
@@ -31,6 +32,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <asm/apic.h>
#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/processor.h>
@@ -39,14 +41,11 @@
#include "internal.h"
-#define DRIVER_VERSION "2.2"
-
static struct microcode_ops *microcode_ops;
-static bool dis_ucode_ldr = true;
-
-bool initrd_gone;
+bool dis_ucode_ldr = true;
-LIST_HEAD(microcode_cache);
+bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
+module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
/*
* Synchronization.
@@ -76,6 +75,8 @@ static u32 final_levels[] = {
0, /* T-101 terminator */
};
+struct early_load_data early_data;
+
/*
* Check the current patch level on this CPU.
*
@@ -90,10 +91,7 @@ static bool amd_check_current_patch_level(void)
native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
- if (IS_ENABLED(CONFIG_X86_32))
- levels = (u32 *)__pa_nodebug(&final_levels);
- else
- levels = final_levels;
+ levels = final_levels;
for (i = 0; levels[i]; i++) {
if (lvl == levels[i])
@@ -105,17 +103,8 @@ static bool amd_check_current_patch_level(void)
static bool __init check_loader_disabled_bsp(void)
{
static const char *__dis_opt_str = "dis_ucode_ldr";
-
-#ifdef CONFIG_X86_32
- const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
- const char *option = (const char *)__pa_nodebug(__dis_opt_str);
- bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
-
-#else /* CONFIG_X86_64 */
const char *cmdline = boot_command_line;
const char *option = __dis_opt_str;
- bool *res = &dis_ucode_ldr;
-#endif
/*
* CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
@@ -123,17 +112,17 @@ static bool __init check_loader_disabled_bsp(void)
* that's good enough as they don't land on the BSP path anyway.
*/
if (native_cpuid_ecx(1) & BIT(31))
- return *res;
+ return true;
if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
if (amd_check_current_patch_level())
- return *res;
+ return true;
}
if (cmdline_find_option_bool(cmdline, option) <= 0)
- *res = false;
+ dis_ucode_ldr = false;
- return *res;
+ return dis_ucode_ldr;
}
void __init load_ucode_bsp(void)
@@ -166,25 +155,16 @@ void __init load_ucode_bsp(void)
return;
if (intel)
- load_ucode_intel_bsp();
+ load_ucode_intel_bsp(&early_data);
else
- load_ucode_amd_early(cpuid_1_eax);
-}
-
-static bool check_loader_disabled_ap(void)
-{
-#ifdef CONFIG_X86_32
- return *((bool *)__pa_nodebug(&dis_ucode_ldr));
-#else
- return dis_ucode_ldr;
-#endif
+ load_ucode_amd_bsp(&early_data, cpuid_1_eax);
}
void load_ucode_ap(void)
{
unsigned int cpuid_1_eax;
- if (check_loader_disabled_ap())
+ if (dis_ucode_ldr)
return;
cpuid_1_eax = native_cpuid_eax(1);
@@ -196,97 +176,44 @@ void load_ucode_ap(void)
break;
case X86_VENDOR_AMD:
if (x86_family(cpuid_1_eax) >= 0x10)
- load_ucode_amd_early(cpuid_1_eax);
+ load_ucode_amd_ap(cpuid_1_eax);
break;
default:
break;
}
}
-static int __init save_microcode_in_initrd(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- int ret = -EINVAL;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- if (c->x86 >= 6)
- ret = save_microcode_in_initrd_intel();
- break;
- case X86_VENDOR_AMD:
- if (c->x86 >= 0x10)
- ret = save_microcode_in_initrd_amd(cpuid_eax(1));
- break;
- default:
- break;
- }
-
- initrd_gone = true;
-
- return ret;
-}
-
-struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+struct cpio_data __init find_microcode_in_initrd(const char *path)
{
#ifdef CONFIG_BLK_DEV_INITRD
unsigned long start = 0;
size_t size;
#ifdef CONFIG_X86_32
- struct boot_params *params;
-
- if (use_pa)
- params = (struct boot_params *)__pa_nodebug(&boot_params);
- else
- params = &boot_params;
-
- size = params->hdr.ramdisk_size;
-
- /*
- * Set start only if we have an initrd image. We cannot use initrd_start
- * because it is not set that early yet.
- */
+ size = boot_params.hdr.ramdisk_size;
+ /* Early load on BSP has a temporary mapping. */
if (size)
- start = params->hdr.ramdisk_image;
+ start = initrd_start_early;
-# else /* CONFIG_X86_64 */
+#else /* CONFIG_X86_64 */
size = (unsigned long)boot_params.ext_ramdisk_size << 32;
size |= boot_params.hdr.ramdisk_size;
if (size) {
start = (unsigned long)boot_params.ext_ramdisk_image << 32;
start |= boot_params.hdr.ramdisk_image;
-
start += PAGE_OFFSET;
}
-# endif
+#endif
/*
* Fixup the start address: after reserve_initrd() runs, initrd_start
* has the virtual address of the beginning of the initrd. It also
* possibly relocates the ramdisk. In either case, initrd_start contains
* the updated address so use that instead.
- *
- * initrd_gone is for the hotplug case where we've thrown out initrd
- * already.
*/
- if (!use_pa) {
- if (initrd_gone)
- return (struct cpio_data){ NULL, 0, "" };
- if (initrd_start)
- start = initrd_start;
- } else {
- /*
- * The picture with physical addresses is a bit different: we
- * need to get the *physical* address to which the ramdisk was
- * relocated, i.e., relocated_ramdisk (not initrd_start) and
- * since we're running from physical addresses, we need to access
- * relocated_ramdisk through its *physical* address too.
- */
- u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
- if (*rr)
- start = *rr;
- }
+ if (initrd_start)
+ start = initrd_start;
return find_cpio_data(path, (void *)start, size, NULL);
#else /* !CONFIG_BLK_DEV_INITRD */
@@ -330,117 +257,298 @@ static struct platform_device *microcode_pdev;
* requirement can be relaxed in the future. Right now, this is conservative
* and good.
*/
-#define SPINUNIT 100 /* 100 nsec */
+enum sibling_ctrl {
+ /* Spinwait with timeout */
+ SCTRL_WAIT,
+ /* Invoke the microcode_apply() callback */
+ SCTRL_APPLY,
+ /* Proceed without invoking the microcode_apply() callback */
+ SCTRL_DONE,
+};
-static int check_online_cpus(void)
+struct microcode_ctrl {
+ enum sibling_ctrl ctrl;
+ enum ucode_state result;
+ unsigned int ctrl_cpu;
+ bool nmi_enabled;
+};
+
+DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
+static DEFINE_PER_CPU(struct microcode_ctrl, ucode_ctrl);
+static atomic_t late_cpus_in, offline_in_nmi;
+static unsigned int loops_per_usec;
+static cpumask_t cpu_offline_mask;
+
+static noinstr bool wait_for_cpus(atomic_t *cnt)
{
- unsigned int cpu;
+ unsigned int timeout, loops;
- /*
- * Make sure all CPUs are online. It's fine for SMT to be disabled if
- * all the primary threads are still online.
- */
- for_each_present_cpu(cpu) {
- if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
- pr_err("Not all CPUs online, aborting microcode update.\n");
- return -EINVAL;
+ WARN_ON_ONCE(raw_atomic_dec_return(cnt) < 0);
+
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (!raw_atomic_read(cnt))
+ return true;
+
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
}
}
-
- return 0;
+ /* Prevent the late comers from making progress and let them time out */
+ raw_atomic_inc(cnt);
+ return false;
}
-static atomic_t late_cpus_in;
-static atomic_t late_cpus_out;
-
-static int __wait_for_cpus(atomic_t *t, long long timeout)
+static noinstr bool wait_for_ctrl(void)
{
- int all_cpus = num_online_cpus();
+ unsigned int timeout, loops;
- atomic_inc(t);
-
- while (atomic_read(t) < all_cpus) {
- if (timeout < SPINUNIT) {
- pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
- all_cpus - atomic_read(t));
- return 1;
- }
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (raw_cpu_read(ucode_ctrl.ctrl) != SCTRL_WAIT)
+ return true;
- ndelay(SPINUNIT);
- timeout -= SPINUNIT;
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
- touch_nmi_watchdog();
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
}
- return 0;
+ return false;
}
/*
- * Returns:
- * < 0 - on error
- * 0 - success (no update done or microcode was updated)
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
*/
-static int __reload_late(void *info)
+static noinstr bool load_secondary_wait(unsigned int ctrl_cpu)
{
- int cpu = smp_processor_id();
- enum ucode_state err;
- int ret = 0;
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ raw_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ return false;
+ }
/*
- * Wait for all CPUs to arrive. A load will not be attempted unless all
- * CPUs show up.
- * */
- if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
- return -1;
+ * Wait for primary threads to complete. If one of them hangs due
+ * to the update, there is no way out. This is non-recoverable
+ * because the CPU might hold locks or resources and confuse the
+ * scheduler, watchdogs etc. There is no way to safely evacuate the
+ * machine.
+ */
+ if (wait_for_ctrl())
+ return true;
+ instrumentation_begin();
+ panic("Microcode load: Primary CPU %d timed out\n", ctrl_cpu);
+ instrumentation_end();
+}
+
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr void load_secondary(unsigned int cpu)
+{
+ unsigned int ctrl_cpu = raw_cpu_read(ucode_ctrl.ctrl_cpu);
+ enum ucode_state ret;
+
+ if (!load_secondary_wait(ctrl_cpu)) {
+ instrumentation_begin();
+ pr_err_once("load: %d CPUs timed out\n",
+ atomic_read(&late_cpus_in) - 1);
+ instrumentation_end();
+ return;
+ }
+
+ /* Primary thread completed. Allow to invoke instrumentable code */
+ instrumentation_begin();
/*
- * On an SMT system, it suffices to load the microcode on one sibling of
- * the core because the microcode engine is shared between the threads.
- * Synchronization still needs to take place so that no concurrent
- * loading attempts happen on multiple threads of an SMT core. See
- * below.
+ * If the primary succeeded then invoke the apply() callback,
+ * otherwise copy the state from the primary thread.
*/
- if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
- err = microcode_ops->apply_microcode(cpu);
+ if (this_cpu_read(ucode_ctrl.ctrl) == SCTRL_APPLY)
+ ret = microcode_ops->apply_microcode(cpu);
else
- goto wait_for_siblings;
+ ret = per_cpu(ucode_ctrl.result, ctrl_cpu);
- if (err >= UCODE_NFOUND) {
- if (err == UCODE_ERROR) {
- pr_warn("Error reloading microcode on CPU %d\n", cpu);
- ret = -1;
- }
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+ instrumentation_end();
+}
+
+static void __load_primary(unsigned int cpu)
+{
+ struct cpumask *secondaries = topology_sibling_cpumask(cpu);
+ enum sibling_ctrl ctrl;
+ enum ucode_state ret;
+ unsigned int sibling;
+
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ this_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ pr_err_once("load: %d CPUs timed out\n", atomic_read(&late_cpus_in) - 1);
+ return;
}
-wait_for_siblings:
- if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC))
- panic("Timeout during microcode update!\n");
+ ret = microcode_ops->apply_microcode(cpu);
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
/*
- * At least one thread has completed update on each core.
- * For others, simply call the update to make sure the
- * per-cpu cpuinfo can be updated with right microcode
- * revision.
+ * If the update was successful, let the siblings run the apply()
+ * callback. If not, tell them it's done. This also covers the
+ * case where the CPU has uniform loading at package or system
+ * scope implemented but does not advertise it.
*/
- if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
- err = microcode_ops->apply_microcode(cpu);
+ if (ret == UCODE_UPDATED || ret == UCODE_OK)
+ ctrl = SCTRL_APPLY;
+ else
+ ctrl = SCTRL_DONE;
+
+ for_each_cpu(sibling, secondaries) {
+ if (sibling != cpu)
+ per_cpu(ucode_ctrl.ctrl, sibling) = ctrl;
+ }
+}
+
+static bool kick_offline_cpus(unsigned int nr_offl)
+{
+ unsigned int cpu, timeout;
+
+ for_each_cpu(cpu, &cpu_offline_mask) {
+ /* Enable the rendezvous handler and send NMI */
+ per_cpu(ucode_ctrl.nmi_enabled, cpu) = true;
+ apic_send_nmi_to_offline_cpu(cpu);
+ }
+
+ /* Wait for them to arrive */
+ for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) {
+ if (atomic_read(&offline_in_nmi) == nr_offl)
+ return true;
+ udelay(1);
+ }
+ /* Let the others time out */
+ return false;
+}
+
+static void release_offline_cpus(void)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, &cpu_offline_mask)
+ per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE;
+}
+
+static void load_primary(unsigned int cpu)
+{
+ unsigned int nr_offl = cpumask_weight(&cpu_offline_mask);
+ bool proceed = true;
+
+ /* Kick soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ proceed = kick_offline_cpus(nr_offl);
+
+ /* If the soft-offlined CPUs did not respond, abort */
+ if (proceed)
+ __load_primary(cpu);
- return ret;
+ /* Unconditionally release soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ release_offline_cpus();
}
/*
- * Reload microcode late on all CPUs. Wait for a sec until they
- * all gather together.
+ * Minimal stub rendezvous handler for soft-offlined CPUs which participate
+ * in the NMI rendezvous to protect against a concurrent NMI on affected
+ * CPUs.
*/
-static int microcode_reload_late(void)
+void noinstr microcode_offline_nmi_handler(void)
{
- int old = boot_cpu_data.microcode, ret;
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return;
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE);
+ raw_atomic_inc(&offline_in_nmi);
+ wait_for_ctrl();
+}
+
+static noinstr bool microcode_update_handler(void)
+{
+ unsigned int cpu = raw_smp_processor_id();
+
+ if (raw_cpu_read(ucode_ctrl.ctrl_cpu) == cpu) {
+ instrumentation_begin();
+ load_primary(cpu);
+ instrumentation_end();
+ } else {
+ load_secondary(cpu);
+ }
+
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+
+ return true;
+}
+
+/*
+ * Protection against instrumentation is required for CPUs which are not
+ * safe against an NMI which is delivered to the secondary SMT sibling
+ * while the primary thread updates the microcode. Instrumentation can end
+ * up in #INT3, #DB and #PF. The IRET from those exceptions reenables NMI
+ * which is the opposite of what the NMI rendezvous is trying to achieve.
+ *
+ * The primary thread is safe versus instrumentation as the actual
+ * microcode update handles this correctly. It's only the sibling code
+ * path which must be NMI safe until the primary thread completed the
+ * update.
+ */
+bool noinstr microcode_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return false;
+
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ return microcode_update_handler();
+}
+
+static int load_cpus_stopped(void *unused)
+{
+ if (microcode_ops->use_nmi) {
+ /* Enable the NMI handler and raise NMI */
+ this_cpu_write(ucode_ctrl.nmi_enabled, true);
+ apic->send_IPI(smp_processor_id(), NMI_VECTOR);
+ } else {
+ /* Just invoke the handler directly */
+ microcode_update_handler();
+ }
+ return 0;
+}
+
+static int load_late_stop_cpus(bool is_safe)
+{
+ unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0;
+ unsigned int nr_offl, offline = 0;
+ int old_rev = boot_cpu_data.microcode;
struct cpuinfo_x86 prev_info;
- pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
- pr_err("You should switch to early loading, if possible.\n");
+ if (!is_safe) {
+ pr_err("Late microcode loading without minimal revision check.\n");
+ pr_err("You should switch to early loading, if possible.\n");
+ }
- atomic_set(&late_cpus_in, 0);
- atomic_set(&late_cpus_out, 0);
+ atomic_set(&late_cpus_in, num_online_cpus());
+ atomic_set(&offline_in_nmi, 0);
+ loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
/*
* Take a snapshot before the microcode update in order to compare and
@@ -448,52 +556,162 @@ static int microcode_reload_late(void)
*/
store_cpu_caps(&prev_info);
- ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
- if (!ret) {
- pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n",
- old, boot_cpu_data.microcode);
- microcode_check(&prev_info);
- } else {
- pr_info("Reload failed, current microcode revision: 0x%x\n",
- boot_cpu_data.microcode);
+ if (microcode_ops->use_nmi)
+ static_branch_enable_cpuslocked(&microcode_nmi_handler_enable);
+
+ stop_machine_cpuslocked(load_cpus_stopped, NULL, cpu_online_mask);
+
+ if (microcode_ops->use_nmi)
+ static_branch_disable_cpuslocked(&microcode_nmi_handler_enable);
+
+ /* Analyze the results */
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ switch (per_cpu(ucode_ctrl.result, cpu)) {
+ case UCODE_UPDATED: updated++; break;
+ case UCODE_TIMEOUT: timedout++; break;
+ case UCODE_OK: siblings++; break;
+ case UCODE_OFFLINE: offline++; break;
+ default: failed++; break;
+ }
}
- return ret;
+ if (microcode_ops->finalize_late_load)
+ microcode_ops->finalize_late_load(!updated);
+
+ if (!updated) {
+ /* Nothing changed. */
+ if (!failed && !timedout)
+ return 0;
+
+ nr_offl = cpumask_weight(&cpu_offline_mask);
+ if (offline < nr_offl) {
+ pr_warn("%u offline siblings did not respond.\n",
+ nr_offl - atomic_read(&offline_in_nmi));
+ return -EIO;
+ }
+ pr_err("update failed: %u CPUs failed %u CPUs timed out\n",
+ failed, timedout);
+ return -EIO;
+ }
+
+ if (!is_safe || failed || timedout)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ pr_info("load: updated on %u primary CPUs with %u siblings\n", updated, siblings);
+ if (failed || timedout) {
+ pr_err("load incomplete. %u CPUs timed out or failed\n",
+ num_online_cpus() - (updated + siblings));
+ }
+ pr_info("revision: 0x%x -> 0x%x\n", old_rev, boot_cpu_data.microcode);
+ microcode_check(&prev_info);
+
+ return updated + siblings == num_online_cpus() ? 0 : -EIO;
+}
+
+/*
+ * This function does two things:
+ *
+ * 1) Ensure that all required CPUs which are present and have been booted
+ * once are online.
+ *
+ * To pass this check, all primary threads must be online.
+ *
+ * If the microcode load is not safe against NMI then all SMT threads
+ * must be online as well because they still react to NMIs when they are
+ * soft-offlined and parked in one of the play_dead() variants. So if a
+ * NMI hits while the primary thread updates the microcode the resulting
+ * behaviour is undefined. The default play_dead() implementation on
+ * modern CPUs uses MWAIT, which is also not guaranteed to be safe
+ * against a microcode update which affects MWAIT.
+ *
+ * As soft-offlined CPUs still react on NMIs, the SMT sibling
+ * restriction can be lifted when the vendor driver signals to use NMI
+ * for rendezvous and the APIC provides a mechanism to send an NMI to a
+ * soft-offlined CPU. The soft-offlined CPUs are then able to
+ * participate in the rendezvous in a trivial stub handler.
+ *
+ * 2) Initialize the per CPU control structure and create a cpumask
+ * which contains "offline"; secondary threads, so they can be handled
+ * correctly by a control CPU.
+ */
+static bool setup_cpus(void)
+{
+ struct microcode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, };
+ bool allow_smt_offline;
+ unsigned int cpu;
+
+ allow_smt_offline = microcode_ops->nmi_safe ||
+ (microcode_ops->use_nmi && apic->nmi_to_offline_cpu);
+
+ cpumask_clear(&cpu_offline_mask);
+
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ /*
+ * Offline CPUs sit in one of the play_dead() functions
+ * with interrupts disabled, but they still react on NMIs
+ * and execute arbitrary code. Also MWAIT being updated
+ * while the offline CPU sits there is not necessarily safe
+ * on all CPU variants.
+ *
+ * Mark them in the offline_cpus mask which will be handled
+ * by CPU0 later in the update process.
+ *
+ * Ensure that the primary thread is online so that it is
+ * guaranteed that all cores are updated.
+ */
+ if (!cpu_online(cpu)) {
+ if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
+ pr_err("CPU %u not online, loading aborted\n", cpu);
+ return false;
+ }
+ cpumask_set_cpu(cpu, &cpu_offline_mask);
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ continue;
+ }
+
+ /*
+ * Initialize the per CPU state. This is core scope for now,
+ * but prepared to take package or system scope into account.
+ */
+ ctrl.ctrl_cpu = cpumask_first(topology_sibling_cpumask(cpu));
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ }
+ return true;
+}
+
+static int load_late_locked(void)
+{
+ if (!setup_cpus())
+ return -EBUSY;
+
+ switch (microcode_ops->request_microcode_fw(0, &microcode_pdev->dev)) {
+ case UCODE_NEW:
+ return load_late_stop_cpus(false);
+ case UCODE_NEW_SAFE:
+ return load_late_stop_cpus(true);
+ case UCODE_NFOUND:
+ return -ENOENT;
+ default:
+ return -EBADFD;
+ }
}
static ssize_t reload_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
- enum ucode_state tmp_ret = UCODE_OK;
- int bsp = boot_cpu_data.cpu_index;
unsigned long val;
- ssize_t ret = 0;
+ ssize_t ret;
ret = kstrtoul(buf, 0, &val);
if (ret || val != 1)
return -EINVAL;
cpus_read_lock();
-
- ret = check_online_cpus();
- if (ret)
- goto put;
-
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev);
- if (tmp_ret != UCODE_NEW)
- goto put;
-
- ret = microcode_reload_late();
-put:
+ ret = load_late_locked();
cpus_read_unlock();
- if (ret == 0)
- ret = size;
-
- add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
-
- return ret;
+ return ret ? : size;
}
static DEVICE_ATTR_WO(reload);
@@ -535,17 +753,6 @@ static void microcode_fini_cpu(int cpu)
microcode_ops->microcode_fini_cpu(cpu);
}
-static enum ucode_state microcode_init_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- memset(uci, 0, sizeof(*uci));
-
- microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
-
- return microcode_ops->apply_microcode(cpu);
-}
-
/**
* microcode_bsp_resume - Update boot CPU microcode during resume.
*/
@@ -564,19 +771,18 @@ static struct syscore_ops mc_syscore_ops = {
.resume = microcode_bsp_resume,
};
-static int mc_cpu_starting(unsigned int cpu)
-{
- enum ucode_state err = microcode_ops->apply_microcode(cpu);
-
- pr_debug("%s: CPU%d, err: %d\n", __func__, cpu, err);
-
- return err == UCODE_ERROR;
-}
-
static int mc_cpu_online(unsigned int cpu)
{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct device *dev = get_cpu_device(cpu);
+ memset(uci, 0, sizeof(*uci));
+
+ microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
return 0;
@@ -584,33 +790,13 @@ static int mc_cpu_online(unsigned int cpu)
static int mc_cpu_down_prep(unsigned int cpu)
{
- struct device *dev;
-
- dev = get_cpu_device(cpu);
+ struct device *dev = get_cpu_device(cpu);
microcode_fini_cpu(cpu);
-
- /* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
- pr_debug("%s: CPU%d\n", __func__, cpu);
-
return 0;
}
-static void setup_online_cpu(struct work_struct *work)
-{
- int cpu = smp_processor_id();
- enum ucode_state err;
-
- err = microcode_init_cpu(cpu);
- if (err == UCODE_ERROR) {
- pr_err("Error applying microcode on CPU%d\n", cpu);
- return;
- }
-
- mc_cpu_online(cpu);
-}
-
static struct attribute *cpu_root_microcode_attrs[] = {
#ifdef CONFIG_MICROCODE_LATE_LOADING
&dev_attr_reload.attr,
@@ -642,6 +828,11 @@ static int __init microcode_init(void)
if (!microcode_ops)
return -ENODEV;
+ pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev));
+
+ if (early_data.new_rev)
+ pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
+
microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
@@ -656,16 +847,9 @@ static int __init microcode_init(void)
}
}
- /* Do per-CPU setup */
- schedule_on_each_cpu(setup_online_cpu);
-
register_syscore_ops(&mc_syscore_ops);
- cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
- mc_cpu_starting, NULL);
- cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
- mc_cpu_online, mc_cpu_down_prep);
-
- pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
return 0;
@@ -674,5 +858,4 @@ static int __init microcode_init(void)
return error;
}
-fs_initcall(save_microcode_in_initrd);
late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 94dd6af9c963..857e608af641 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -14,7 +14,6 @@
#include <linux/earlycpio.h>
#include <linux/firmware.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -32,11 +31,14 @@
static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
+#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL)
+
/* Current microcode patch used in early patching on the APs. */
-static struct microcode_intel *intel_ucode_patch;
+static struct microcode_intel *ucode_patch_va __read_mostly;
+static struct microcode_intel *ucode_patch_late __read_mostly;
/* last level cache size per core */
-static int llc_size_per_core;
+static unsigned int llc_size_per_core __ro_after_init;
/* microcode format is extended from prescott processors */
struct extended_signature {
@@ -66,60 +68,52 @@ static inline unsigned int exttable_size(struct extended_sigtable *et)
return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE;
}
-int intel_cpu_collect_info(struct ucode_cpu_info *uci)
+void intel_collect_cpu_info(struct cpu_signature *sig)
{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig = { 0 };
- unsigned int eax, ebx, ecx, edx;
-
- memset(uci, 0, sizeof(*uci));
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
+ sig->sig = cpuid_eax(1);
+ sig->pf = 0;
+ sig->rev = intel_get_microcode_revision();
- family = x86_family(eax);
- model = x86_model(eax);
+ if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) {
+ unsigned int val[2];
- if (model >= 5 || family > 6) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
+ sig->pf = 1 << ((val[1] >> 18) & 7);
}
+}
+EXPORT_SYMBOL_GPL(intel_collect_cpu_info);
- csig.rev = intel_get_microcode_revision();
-
- uci->cpu_sig = csig;
+static inline bool cpu_signatures_match(struct cpu_signature *s1, unsigned int sig2,
+ unsigned int pf2)
+{
+ if (s1->sig != sig2)
+ return false;
- return 0;
+ /* Processor flags are either both 0 or they intersect. */
+ return ((!s1->pf && !pf2) || (s1->pf & pf2));
}
-EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int intel_find_matching_signature(void *mc, unsigned int csig, int cpf)
+bool intel_find_matching_signature(void *mc, struct cpu_signature *sig)
{
struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
struct extended_signature *ext_sig;
+ struct extended_sigtable *ext_hdr;
int i;
- if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
+ if (cpu_signatures_match(sig, mc_hdr->sig, mc_hdr->pf))
+ return true;
/* Look for ext. headers: */
if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
+ return false;
ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE;
ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
for (i = 0; i < ext_hdr->count; i++) {
- if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
+ if (cpu_signatures_match(sig, ext_sig->sig, ext_sig->pf))
+ return true;
ext_sig++;
}
return 0;
@@ -240,264 +234,91 @@ int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
}
EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
+static void update_ucode_pointer(struct microcode_intel *mc)
{
- struct microcode_header_intel *mc_hdr = mc;
-
- if (mc_hdr->rev <= new_rev)
- return 0;
-
- return intel_find_matching_signature(mc, csig, cpf);
-}
-
-static struct ucode_patch *memdup_patch(void *data, unsigned int size)
-{
- struct ucode_patch *p;
-
- p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
- if (!p)
- return NULL;
-
- p->data = kmemdup(data, size, GFP_KERNEL);
- if (!p->data) {
- kfree(p);
- return NULL;
- }
-
- return p;
-}
-
-static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
-{
- struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- struct ucode_patch *iter, *tmp, *p = NULL;
- bool prev_found = false;
- unsigned int sig, pf;
-
- mc_hdr = (struct microcode_header_intel *)data;
-
- list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
- mc_saved_hdr = (struct microcode_header_intel *)iter->data;
- sig = mc_saved_hdr->sig;
- pf = mc_saved_hdr->pf;
-
- if (intel_find_matching_signature(data, sig, pf)) {
- prev_found = true;
-
- if (mc_hdr->rev <= mc_saved_hdr->rev)
- continue;
-
- p = memdup_patch(data, size);
- if (!p)
- pr_err("Error allocating buffer %p\n", data);
- else {
- list_replace(&iter->plist, &p->plist);
- kfree(iter->data);
- kfree(iter);
- }
- }
- }
+ kvfree(ucode_patch_va);
/*
- * There weren't any previous patches found in the list cache; save the
- * newly found.
+ * Save the virtual address for early loading and for eventual free
+ * on late loading.
*/
- if (!prev_found) {
- p = memdup_patch(data, size);
- if (!p)
- pr_err("Error allocating buffer for %p\n", data);
- else
- list_add_tail(&p->plist, &microcode_cache);
- }
-
- if (!p)
- return;
+ ucode_patch_va = mc;
+}
- if (!intel_find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
- return;
+static void save_microcode_patch(struct microcode_intel *patch)
+{
+ unsigned int size = get_totalsize(&patch->hdr);
+ struct microcode_intel *mc;
- /*
- * Save for early loading. On 32-bit, that needs to be a physical
- * address as the APs are running from physical addresses, before
- * paging has been enabled.
- */
- if (IS_ENABLED(CONFIG_X86_32))
- intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ mc = kvmemdup(patch, size, GFP_KERNEL);
+ if (mc)
+ update_ucode_pointer(mc);
else
- intel_ucode_patch = p->data;
+ pr_err("Unable to allocate microcode memory size: %u\n", size);
}
-/*
- * Get microcode matching with BSP's model. Only CPUs with the same model as
- * BSP can stay in the platform.
- */
-static struct microcode_intel *
-scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
+/* Scan blob for microcode matching the boot CPUs family, model, stepping */
+static __init struct microcode_intel *scan_microcode(void *data, size_t size,
+ struct ucode_cpu_info *uci,
+ bool save)
{
struct microcode_header_intel *mc_header;
struct microcode_intel *patch = NULL;
+ u32 cur_rev = uci->cpu_sig.rev;
unsigned int mc_size;
- while (size) {
- if (size < sizeof(struct microcode_header_intel))
- break;
-
+ for (; size >= sizeof(struct microcode_header_intel); size -= mc_size, data += mc_size) {
mc_header = (struct microcode_header_intel *)data;
mc_size = get_totalsize(mc_header);
- if (!mc_size ||
- mc_size > size ||
+ if (!mc_size || mc_size > size ||
intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0)
break;
- size -= mc_size;
-
- if (!intel_find_matching_signature(data, uci->cpu_sig.sig,
- uci->cpu_sig.pf)) {
- data += mc_size;
+ if (!intel_find_matching_signature(data, &uci->cpu_sig))
continue;
- }
+ /*
+ * For saving the early microcode, find the matching revision which
+ * was loaded on the BSP.
+ *
+ * On the BSP during early boot, find a newer revision than
+ * actually loaded in the CPU.
+ */
if (save) {
- save_microcode_patch(uci, data, mc_size);
- goto next;
- }
-
-
- if (!patch) {
- if (!has_newer_microcode(data,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf,
- uci->cpu_sig.rev))
- goto next;
-
- } else {
- struct microcode_header_intel *phdr = &patch->hdr;
-
- if (!has_newer_microcode(data,
- phdr->sig,
- phdr->pf,
- phdr->rev))
- goto next;
+ if (cur_rev != mc_header->rev)
+ continue;
+ } else if (cur_rev >= mc_header->rev) {
+ continue;
}
- /* We have a newer patch, save it. */
patch = data;
-
-next:
- data += mc_size;
- }
-
- if (size)
- return NULL;
-
- return patch;
-}
-
-static bool load_builtin_intel_microcode(struct cpio_data *cp)
-{
- unsigned int eax = 1, ebx, ecx = 0, edx;
- struct firmware fw;
- char name[30];
-
- if (IS_ENABLED(CONFIG_X86_32))
- return false;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- sprintf(name, "intel-ucode/%02x-%02x-%02x",
- x86_family(eax), x86_model(eax), x86_stepping(eax));
-
- if (firmware_request_builtin(&fw, name)) {
- cp->size = fw.size;
- cp->data = (void *)fw.data;
- return true;
+ cur_rev = mc_header->rev;
}
- return false;
+ return size ? NULL : patch;
}
-static void print_ucode_info(int old_rev, int new_rev, unsigned int date)
+static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
+ struct microcode_intel *mc,
+ u32 *cur_rev)
{
- pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n",
- old_rev,
- new_rev,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
-}
-
-#ifdef CONFIG_X86_32
-
-static int delay_ucode_info;
-static int current_mc_date;
-static int early_old_rev;
-
-/*
- * Print early updated ucode info after printk works. This is delayed info dump.
- */
-void show_ucode_info_early(void)
-{
- struct ucode_cpu_info uci;
-
- if (delay_ucode_info) {
- intel_cpu_collect_info(&uci);
- print_ucode_info(early_old_rev, uci.cpu_sig.rev, current_mc_date);
- delay_ucode_info = 0;
- }
-}
-
-/*
- * At this point, we can not call printk() yet. Delay printing microcode info in
- * show_ucode_info_early() until printk() works.
- */
-static void print_ucode(int old_rev, int new_rev, int date)
-{
- int *delay_ucode_info_p;
- int *current_mc_date_p;
- int *early_old_rev_p;
-
- delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
- current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
- early_old_rev_p = (int *)__pa_nodebug(&early_old_rev);
-
- *delay_ucode_info_p = 1;
- *current_mc_date_p = date;
- *early_old_rev_p = old_rev;
-}
-#else
-
-static inline void print_ucode(int old_rev, int new_rev, int date)
-{
- print_ucode_info(old_rev, new_rev, date);
-}
-#endif
-
-static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
-{
- struct microcode_intel *mc;
- u32 rev, old_rev;
+ u32 rev;
- mc = uci->mc;
if (!mc)
- return 0;
+ return UCODE_NFOUND;
/*
* Save us the MSR write below - which is a particular expensive
* operation - when the other hyperthread has updated the microcode
* already.
*/
- rev = intel_get_microcode_revision();
- if (rev >= mc->hdr.rev) {
- uci->cpu_sig.rev = rev;
+ *cur_rev = intel_get_microcode_revision();
+ if (*cur_rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = *cur_rev;
return UCODE_OK;
}
- old_rev = rev;
-
/*
* Writeback and invalidate caches before updating microcode to avoid
* internal issues depending on what the microcode is updating.
@@ -509,247 +330,173 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
rev = intel_get_microcode_revision();
if (rev != mc->hdr.rev)
- return -1;
+ return UCODE_ERROR;
uci->cpu_sig.rev = rev;
+ return UCODE_UPDATED;
+}
- if (early)
- print_ucode(old_rev, uci->cpu_sig.rev, mc->hdr.date);
- else
- print_ucode_info(old_rev, uci->cpu_sig.rev, mc->hdr.date);
+static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc = uci->mc;
+ u32 cur_rev;
- return 0;
+ return __apply_microcode(uci, mc, &cur_rev);
}
-int __init save_microcode_in_initrd_intel(void)
+static __init bool load_builtin_intel_microcode(struct cpio_data *cp)
{
- struct ucode_cpu_info uci;
- struct cpio_data cp;
-
- /*
- * initrd is going away, clear patch ptr. We will scan the microcode one
- * last time before jettisoning and save a patch, if found. Then we will
- * update that pointer too, with a stable patch address to use when
- * resuming the cores.
- */
- intel_ucode_patch = NULL;
+ unsigned int eax = 1, ebx, ecx = 0, edx;
+ struct firmware fw;
+ char name[30];
- if (!load_builtin_intel_microcode(&cp))
- cp = find_microcode_in_initrd(ucode_path, false);
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
- if (!(cp.data && cp.size))
- return 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
- intel_cpu_collect_info(&uci);
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
- scan_microcode(cp.data, cp.size, &uci, true);
- return 0;
+ if (firmware_request_builtin(&fw, name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+ return false;
}
-/*
- * @res_patch, output: a pointer to the patch we found.
- */
-static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
+static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *uci, bool save)
{
- static const char *path;
struct cpio_data cp;
- bool use_pa;
-
- if (IS_ENABLED(CONFIG_X86_32)) {
- path = (const char *)__pa_nodebug(ucode_path);
- use_pa = true;
- } else {
- path = ucode_path;
- use_pa = false;
- }
- /* try built-in microcode first */
+ intel_collect_cpu_info(&uci->cpu_sig);
+
if (!load_builtin_intel_microcode(&cp))
- cp = find_microcode_in_initrd(path, use_pa);
+ cp = find_microcode_in_initrd(ucode_path);
if (!(cp.data && cp.size))
return NULL;
- intel_cpu_collect_info(uci);
-
- return scan_microcode(cp.data, cp.size, uci, false);
+ return scan_microcode(cp.data, cp.size, uci, save);
}
-void __init load_ucode_intel_bsp(void)
+/*
+ * Invoked from an early init call to save the microcode blob which was
+ * selected during early boot when mm was not usable. The microcode must be
+ * saved because initrd is going away. It's an early init call so the APs
+ * just can use the pointer and do not have to scan initrd/builtin firmware
+ * again.
+ */
+static int __init save_builtin_microcode(void)
{
- struct microcode_intel *patch;
struct ucode_cpu_info uci;
- patch = __load_ucode_intel(&uci);
- if (!patch)
- return;
+ if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
+ return 0;
- uci.mc = patch;
+ if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
- apply_microcode_early(&uci, true);
+ uci.mc = get_microcode_blob(&uci, true);
+ if (uci.mc)
+ save_microcode_patch(uci.mc);
+ return 0;
}
+early_initcall(save_builtin_microcode);
-void load_ucode_intel_ap(void)
+/* Load microcode on BSP from initrd or builtin blobs */
+void __init load_ucode_intel_bsp(struct early_load_data *ed)
{
- struct microcode_intel *patch, **iup;
struct ucode_cpu_info uci;
- if (IS_ENABLED(CONFIG_X86_32))
- iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
- else
- iup = &intel_ucode_patch;
+ uci.mc = get_microcode_blob(&uci, false);
+ ed->old_rev = uci.cpu_sig.rev;
- if (!*iup) {
- patch = __load_ucode_intel(&uci);
- if (!patch)
- return;
-
- *iup = patch;
+ if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) {
+ ucode_patch_va = UCODE_BSP_LOADED;
+ ed->new_rev = uci.cpu_sig.rev;
}
-
- uci.mc = *iup;
-
- apply_microcode_early(&uci, true);
}
-static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
+void load_ucode_intel_ap(void)
{
- struct microcode_header_intel *phdr;
- struct ucode_patch *iter, *tmp;
-
- list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
-
- phdr = (struct microcode_header_intel *)iter->data;
-
- if (phdr->rev <= uci->cpu_sig.rev)
- continue;
-
- if (!intel_find_matching_signature(phdr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf))
- continue;
+ struct ucode_cpu_info uci;
- return iter->data;
- }
- return NULL;
+ uci.mc = ucode_patch_va;
+ if (uci.mc)
+ apply_microcode_early(&uci);
}
+/* Reload microcode on resume */
void reload_ucode_intel(void)
{
- struct microcode_intel *p;
- struct ucode_cpu_info uci;
-
- intel_cpu_collect_info(&uci);
-
- p = find_patch(&uci);
- if (!p)
- return;
-
- uci.mc = p;
+ struct ucode_cpu_info uci = { .mc = ucode_patch_va, };
- apply_microcode_early(&uci, false);
+ if (uci.mc)
+ apply_microcode_early(&uci);
}
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- unsigned int val[2];
-
- memset(csig, 0, sizeof(*csig));
-
- csig->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig->pf = 1 << ((val[1] >> 18) & 7);
- }
-
- csig->rev = c->microcode;
-
+ intel_collect_cpu_info(csig);
return 0;
}
-static enum ucode_state apply_microcode_intel(int cpu)
+static enum ucode_state apply_microcode_late(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
- struct microcode_intel *mc;
+ struct microcode_intel *mc = ucode_patch_late;
enum ucode_state ret;
- static int prev_rev;
- u32 rev;
+ u32 cur_rev;
- /* We should bind the task to the CPU */
- if (WARN_ON(raw_smp_processor_id() != cpu))
+ if (WARN_ON_ONCE(smp_processor_id() != cpu))
return UCODE_ERROR;
- /* Look for a newer patch in our cache: */
- mc = find_patch(uci);
- if (!mc) {
- mc = uci->mc;
- if (!mc)
- return UCODE_NFOUND;
- }
+ ret = __apply_microcode(uci, mc, &cur_rev);
+ if (ret != UCODE_UPDATED && ret != UCODE_OK)
+ return ret;
+
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ return ret;
+}
+
+static bool ucode_validate_minrev(struct microcode_header_intel *mc_header)
+{
+ int cur_rev = boot_cpu_data.microcode;
/*
- * Save us the MSR write below - which is a particular expensive
- * operation - when the other hyperthread has updated the microcode
- * already.
+ * When late-loading, ensure the header declares a minimum revision
+ * required to perform a late-load. The previously reserved field
+ * is 0 in older microcode blobs.
*/
- rev = intel_get_microcode_revision();
- if (rev >= mc->hdr.rev) {
- ret = UCODE_OK;
- goto out;
+ if (!mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Microcode header does not specify a required min version\n");
+ return false;
}
/*
- * Writeback and invalidate caches before updating microcode to avoid
- * internal issues depending on what the microcode is updating.
+ * Check whether the current revision is either greater or equal to
+ * to the minimum revision specified in the header.
*/
- native_wbinvd();
-
- /* write microcode via MSR 0x79 */
- wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
-
- rev = intel_get_microcode_revision();
-
- if (rev != mc->hdr.rev) {
- pr_err("CPU%d update to revision 0x%x failed\n",
- cpu, mc->hdr.rev);
- return UCODE_ERROR;
- }
-
- if (bsp && rev != prev_rev) {
- pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
- rev,
- mc->hdr.date & 0xffff,
- mc->hdr.date >> 24,
- (mc->hdr.date >> 16) & 0xff);
- prev_rev = rev;
+ if (cur_rev < mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Current revision 0x%x too old\n", cur_rev);
+ pr_info("Current should be at 0x%x or higher. Use early loading instead\n", mc_header->min_req_ver);
+ return false;
}
-
- ret = UCODE_UPDATED;
-
-out:
- uci->cpu_sig.rev = rev;
- c->microcode = rev;
-
- /* Update boot_cpu_data's revision too, if we're on the BSP: */
- if (bsp)
- boot_cpu_data.microcode = rev;
-
- return ret;
+ return true;
}
-static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
+static enum ucode_state parse_microcode_blobs(int cpu, struct iov_iter *iter)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- unsigned int curr_mc_size = 0, new_mc_size = 0;
- enum ucode_state ret = UCODE_OK;
- int new_rev = uci->cpu_sig.rev;
+ bool is_safe, new_is_safe = false;
+ int cur_rev = uci->cpu_sig.rev;
+ unsigned int curr_mc_size = 0;
u8 *new_mc = NULL, *mc = NULL;
- unsigned int csig, cpf;
while (iov_iter_count(iter)) {
struct microcode_header_intel mc_header;
@@ -758,68 +505,66 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) {
pr_err("error! Truncated or inaccessible header in microcode data file\n");
- break;
+ goto fail;
}
mc_size = get_totalsize(&mc_header);
if (mc_size < sizeof(mc_header)) {
pr_err("error! Bad data in microcode data file (totalsize too small)\n");
- break;
+ goto fail;
}
data_size = mc_size - sizeof(mc_header);
if (data_size > iov_iter_count(iter)) {
pr_err("error! Bad data in microcode data file (truncated file?)\n");
- break;
+ goto fail;
}
/* For performance reasons, reuse mc area when possible */
if (!mc || mc_size > curr_mc_size) {
- vfree(mc);
- mc = vmalloc(mc_size);
+ kvfree(mc);
+ mc = kvmalloc(mc_size, GFP_KERNEL);
if (!mc)
- break;
+ goto fail;
curr_mc_size = mc_size;
}
memcpy(mc, &mc_header, sizeof(mc_header));
data = mc + sizeof(mc_header);
if (!copy_from_iter_full(data, data_size, iter) ||
- intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) {
- break;
- }
+ intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0)
+ goto fail;
- csig = uci->cpu_sig.sig;
- cpf = uci->cpu_sig.pf;
- if (has_newer_microcode(mc, csig, cpf, new_rev)) {
- vfree(new_mc);
- new_rev = mc_header.rev;
- new_mc = mc;
- new_mc_size = mc_size;
- mc = NULL; /* trigger new vmalloc */
- ret = UCODE_NEW;
- }
- }
+ if (cur_rev >= mc_header.rev)
+ continue;
- vfree(mc);
+ if (!intel_find_matching_signature(mc, &uci->cpu_sig))
+ continue;
- if (iov_iter_count(iter)) {
- vfree(new_mc);
- return UCODE_ERROR;
+ is_safe = ucode_validate_minrev(&mc_header);
+ if (force_minrev && !is_safe)
+ continue;
+
+ kvfree(new_mc);
+ cur_rev = mc_header.rev;
+ new_mc = mc;
+ new_is_safe = is_safe;
+ mc = NULL;
}
+ if (iov_iter_count(iter))
+ goto fail;
+
+ kvfree(mc);
if (!new_mc)
return UCODE_NFOUND;
- vfree(uci->mc);
- uci->mc = (struct microcode_intel *)new_mc;
-
- /* Save for CPU hotplug */
- save_microcode_patch(uci, new_mc, new_mc_size);
+ ucode_patch_late = (struct microcode_intel *)new_mc;
+ return new_is_safe ? UCODE_NEW_SAFE : UCODE_NEW;
- pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
-
- return ret;
+fail:
+ kvfree(mc);
+ kvfree(new_mc);
+ return UCODE_ERROR;
}
static bool is_blacklisted(unsigned int cpu)
@@ -868,26 +613,36 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
kvec.iov_base = (void *)firmware->data;
kvec.iov_len = firmware->size;
iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size);
- ret = generic_load_microcode(cpu, &iter);
+ ret = parse_microcode_blobs(cpu, &iter);
release_firmware(firmware);
return ret;
}
+static void finalize_late_load(int result)
+{
+ if (!result)
+ update_ucode_pointer(ucode_patch_late);
+ else
+ kvfree(ucode_patch_late);
+ ucode_patch_late = NULL;
+}
+
static struct microcode_ops microcode_intel_ops = {
- .request_microcode_fw = request_microcode_fw,
- .collect_cpu_info = collect_cpu_info,
- .apply_microcode = apply_microcode_intel,
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode_late,
+ .finalize_late_load = finalize_late_load,
+ .use_nmi = IS_ENABLED(CONFIG_X86_64),
};
-static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
+static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
{
u64 llc_size = c->x86_cache_size * 1024ULL;
do_div(llc_size, c->x86_max_cores);
-
- return (int)llc_size;
+ llc_size_per_core = (unsigned int)llc_size;
}
struct microcode_ops * __init init_intel_microcode(void)
@@ -900,7 +655,7 @@ struct microcode_ops * __init init_intel_microcode(void)
return NULL;
}
- llc_size_per_core = calc_llc_size_per_core(c);
+ calc_llc_size_per_core(c);
return &microcode_intel_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index bf883aa71233..21776c529fa9 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -8,43 +8,43 @@
#include <asm/cpu.h>
#include <asm/microcode.h>
-struct ucode_patch {
- struct list_head plist;
- void *data; /* Intel uses only this one */
- unsigned int size;
- u32 patch_id;
- u16 equiv_cpu;
-};
-
-extern struct list_head microcode_cache;
-
struct device;
enum ucode_state {
UCODE_OK = 0,
UCODE_NEW,
+ UCODE_NEW_SAFE,
UCODE_UPDATED,
UCODE_NFOUND,
UCODE_ERROR,
+ UCODE_TIMEOUT,
+ UCODE_OFFLINE,
};
struct microcode_ops {
enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev);
-
void (*microcode_fini_cpu)(int cpu);
/*
- * The generic 'microcode_core' part guarantees that
- * the callbacks below run on a target cpu when they
- * are being called.
+ * The generic 'microcode_core' part guarantees that the callbacks
+ * below run on a target CPU when they are being called.
* See also the "Synchronization" section in microcode_core.c.
*/
- enum ucode_state (*apply_microcode)(int cpu);
- int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+ enum ucode_state (*apply_microcode)(int cpu);
+ int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+ void (*finalize_late_load)(int result);
+ unsigned int nmi_safe : 1,
+ use_nmi : 1;
+};
+
+struct early_load_data {
+ u32 old_rev;
+ u32 new_rev;
};
+extern struct early_load_data early_data;
extern struct ucode_cpu_info ucode_cpu_info[];
-struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa);
+struct cpio_data find_microcode_in_initrd(const char *path);
#define MAX_UCODE_COUNT 128
@@ -94,20 +94,19 @@ static inline unsigned int x86_cpuid_family(void)
return x86_family(eax);
}
-extern bool initrd_gone;
+extern bool dis_ucode_ldr;
+extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
-void load_ucode_amd_bsp(unsigned int family);
+void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
void load_ucode_amd_ap(unsigned int family);
-void load_ucode_amd_early(unsigned int cpuid_1_eax);
int save_microcode_in_initrd_amd(unsigned int family);
void reload_ucode_amd(unsigned int cpu);
struct microcode_ops *init_amd_microcode(void);
void exit_amd_microcode(void);
#else /* CONFIG_CPU_SUP_AMD */
-static inline void load_ucode_amd_bsp(unsigned int family) { }
+static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
static inline void load_ucode_amd_ap(unsigned int family) { }
-static inline void load_ucode_amd_early(unsigned int family) { }
static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
static inline void reload_ucode_amd(unsigned int cpu) { }
static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
@@ -115,15 +114,13 @@ static inline void exit_amd_microcode(void) { }
#endif /* !CONFIG_CPU_SUP_AMD */
#ifdef CONFIG_CPU_SUP_INTEL
-void load_ucode_intel_bsp(void);
+void load_ucode_intel_bsp(struct early_load_data *ed);
void load_ucode_intel_ap(void);
-int save_microcode_in_initrd_intel(void);
void reload_ucode_intel(void);
struct microcode_ops *init_intel_microcode(void);
#else /* CONFIG_CPU_SUP_INTEL */
-static inline void load_ucode_intel_bsp(void) { }
+static inline void load_ucode_intel_bsp(struct early_load_data *ed) { }
static inline void load_ucode_intel_ap(void) { }
-static inline int save_microcode_in_initrd_intel(void) { return -EINVAL; }
static inline void reload_ucode_intel(void) { }
static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
#endif /* !CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e6bba12c759c..01fa06dd06b6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -262,11 +262,14 @@ static uint32_t __init ms_hyperv_platform(void)
static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
{
static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+ unsigned int old_cpu, this_cpu;
if (!unknown_nmi_panic)
return NMI_DONE;
- if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+ old_cpu = -1;
+ this_cpu = raw_smp_processor_id();
+ if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu))
return NMI_HANDLED;
return NMI_DONE;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 2d6aa5d2e3d7..d3524778a545 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -428,6 +428,10 @@ void __init mtrr_copy_map(void)
* from the x86_init.hyper.init_platform() hook. It can be called only once.
* The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR
* is cleared.
+ *
+ * @var: MTRR variable range array to use
+ * @num_var: length of the @var array
+ * @def_type: default caching type
*/
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type)
@@ -492,13 +496,15 @@ static u8 type_merge(u8 type, u8 new_type, u8 *uniform)
/**
* mtrr_type_lookup - look up memory type in MTRR
*
+ * @start: Begin of the physical address range
+ * @end: End of the physical address range
+ * @uniform: output argument:
+ * - 1: the returned MTRR type is valid for the whole region
+ * - 0: otherwise
+ *
* Return Values:
* MTRR_TYPE_(type) - The effective MTRR type for the region
* MTRR_TYPE_INVALID - MTRR is disabled
- *
- * Output Argument:
- * uniform - Set to 1 when the returned MTRR type is valid for the whole
- * region, set to 0 else.
*/
u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 31c0e68f6227..e65fae63660e 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -20,13 +20,13 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id);
seq_printf(m, "siblings\t: %d\n",
cpumask_weight(topology_core_cpumask(cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "core id\t\t: %d\n", c->topo.core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
+ seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid);
#endif
}
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 030d3b409768..19e0681f0435 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -152,6 +152,7 @@ static inline void cache_alloc_hsw_probe(void)
r->cache.cbm_len = 20;
r->cache.shareable_bits = 0xc0000;
r->cache.min_cbm_bits = 2;
+ r->cache.arch_has_sparse_bitmasks = false;
r->alloc_capable = true;
rdt_alloc_capable = true;
@@ -267,15 +268,18 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_x_ecx ecx;
union cpuid_0x10_x_edx edx;
- u32 ebx, ecx;
+ u32 ebx;
- cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
r->cache.shareable_bits = ebx & r->default_ctrl;
r->data_width = (r->cache.cbm_len + 3) / 4;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
r->alloc_capable = true;
}
@@ -872,7 +876,6 @@ static __init void rdt_init_res_defs_intel(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
- r->cache.arch_has_sparse_bitmaps = false;
r->cache.arch_has_per_cpu_cfg = false;
r->cache.min_cbm_bits = 1;
} else if (r->rid == RDT_RESOURCE_MBA) {
@@ -892,7 +895,7 @@ static __init void rdt_init_res_defs_amd(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
- r->cache.arch_has_sparse_bitmaps = true;
+ r->cache.arch_has_sparse_bitmasks = true;
r->cache.arch_has_per_cpu_cfg = true;
r->cache.min_cbm_bits = 0;
} else if (r->rid == RDT_RESOURCE_MBA) {
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index b44c487727d4..beccb0e87ba7 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -87,10 +87,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
/*
* Check whether a cache bit mask is valid.
- * For Intel the SDM says:
- * Please note that all (and only) contiguous '1' combinations
- * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
- * Additionally Haswell requires at least two bits set.
+ * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
+ * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
+ * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
+ *
+ * Haswell does not support a non-contiguous 1s value and additionally
+ * requires at least two bits set.
* AMD allows non-contiguous bitmasks.
*/
static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
@@ -113,8 +115,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
first_bit = find_first_bit(&val, cbm_len);
zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
- /* Are non-contiguous bitmaps allowed? */
- if (!r->cache.arch_has_sparse_bitmaps &&
+ /* Are non-contiguous bitmasks allowed? */
+ if (!r->cache.arch_has_sparse_bitmasks &&
(find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
return false;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 85ceaf9a31ac..a4f1aa15f0a2 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -59,6 +59,7 @@ struct rdt_fs_context {
bool enable_cdpl2;
bool enable_cdpl3;
bool enable_mba_mbps;
+ bool enable_debug;
};
static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
@@ -243,18 +244,17 @@ struct rdtgroup {
*/
#define RFTYPE_INFO BIT(0)
#define RFTYPE_BASE BIT(1)
-#define RF_CTRLSHIFT 4
-#define RF_MONSHIFT 5
-#define RF_TOPSHIFT 6
-#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
-#define RFTYPE_MON BIT(RF_MONSHIFT)
-#define RFTYPE_TOP BIT(RF_TOPSHIFT)
+#define RFTYPE_CTRL BIT(4)
+#define RFTYPE_MON BIT(5)
+#define RFTYPE_TOP BIT(6)
#define RFTYPE_RES_CACHE BIT(8)
#define RFTYPE_RES_MB BIT(9)
-#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
-#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
-#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
-#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+#define RFTYPE_DEBUG BIT(10)
+#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
+#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
+#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
+#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON)
/* List of all resource groups */
extern struct list_head rdt_all_groups;
@@ -270,7 +270,7 @@ void __exit rdtgroup_exit(void);
* @mode: Access mode
* @kf_ops: File operations
* @flags: File specific RFTYPE_FLAGS_* flags
- * @fflags: File specific RF_* or RFTYPE_* flags
+ * @fflags: File specific RFTYPE_* flags
* @seq_show: Show content of the file
* @write: Write to the file
*/
@@ -492,6 +492,15 @@ union cpuid_0x10_3_eax {
unsigned int full;
};
+/* CPUID.(EAX=10H, ECX=ResID).ECX */
+union cpuid_0x10_x_ecx {
+ struct {
+ unsigned int reserved:3;
+ unsigned int noncont:1;
+ } split;
+ unsigned int full;
+};
+
/* CPUID.(EAX=10H, ECX=ResID).EDX */
union cpuid_0x10_x_edx {
struct {
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index ded1fc7cb7cb..f136ac046851 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -30,15 +30,15 @@ struct rmid_entry {
struct list_head list;
};
-/**
- * @rmid_free_lru A least recently used list of free RMIDs
+/*
+ * @rmid_free_lru - A least recently used list of free RMIDs
* These RMIDs are guaranteed to have an occupancy less than the
* threshold occupancy
*/
static LIST_HEAD(rmid_free_lru);
-/**
- * @rmid_limbo_count count of currently unused but (potentially)
+/*
+ * @rmid_limbo_count - count of currently unused but (potentially)
* dirty RMIDs.
* This counts RMIDs that no one is currently using but that
* may have a occupancy value > resctrl_rmid_realloc_threshold. User can
@@ -46,7 +46,7 @@ static LIST_HEAD(rmid_free_lru);
*/
static unsigned int rmid_limbo_count;
-/**
+/*
* @rmid_entry - The entry in the limbo and free lists.
*/
static struct rmid_entry *rmid_ptrs;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 725344048f85..69a1de92384a 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -54,8 +54,13 @@ static struct kernfs_node *kn_mondata;
static struct seq_buf last_cmd_status;
static char last_cmd_status_buf[512];
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
+static void rdtgroup_destroy_root(void);
+
struct dentry *debugfs_resctrl;
+static bool resctrl_debug;
+
void rdt_last_cmd_clear(void)
{
lockdep_assert_held(&rdtgroup_mutex);
@@ -696,11 +701,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct rdtgroup *rdtgrp;
+ char *pid_str;
int ret = 0;
pid_t pid;
- if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
rdtgroup_kn_unlock(of->kn);
@@ -715,7 +719,27 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
goto unlock;
}
- ret = rdtgroup_move_task(pid, rdtgrp, of);
+ while (buf && buf[0] != '\0' && buf[0] != '\n') {
+ pid_str = strim(strsep(&buf, ","));
+
+ if (kstrtoint(pid_str, 0, &pid)) {
+ rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
+ ret = -EINVAL;
+ break;
+ }
+
+ if (pid < 0) {
+ rdt_last_cmd_printf("Invalid pid %d\n", pid);
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = rdtgroup_move_task(pid, rdtgrp, of);
+ if (ret) {
+ rdt_last_cmd_printf("Error while processing task %d\n", pid);
+ break;
+ }
+ }
unlock:
rdtgroup_kn_unlock(of->kn);
@@ -755,6 +779,38 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
return ret;
}
+static int rdtgroup_closid_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ seq_printf(s, "%u\n", rdtgrp->closid);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+static int rdtgroup_rmid_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ seq_printf(s, "%u\n", rdtgrp->mon.rmid);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
#ifdef CONFIG_PROC_CPU_RESCTRL
/*
@@ -895,7 +951,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
return 0;
}
-/**
+/*
* rdt_bit_usage_show - Display current usage of resources
*
* A domain is a shared resource that can now be allocated differently. Here
@@ -1117,12 +1173,24 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
}
}
+static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
+
+ return 0;
+}
+
/**
* __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
* @r: Resource to which domain instance @d belongs.
* @d: The domain instance for which @closid is being tested.
* @cbm: Capacity bitmask being tested.
* @closid: Intended closid for @cbm.
+ * @type: CDP type of @r.
* @exclusive: Only check if overlaps with exclusive resource groups
*
* Checks if provided @cbm intended to be used for @closid on domain
@@ -1209,6 +1277,7 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
/**
* rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
+ * @rdtgrp: Resource group identified through its closid.
*
* An exclusive resource group implies that there should be no sharing of
* its allocated resources. At the time this group is considered to be
@@ -1251,9 +1320,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
return true;
}
-/**
+/*
* rdtgroup_mode_write - Modify the resource group's mode
- *
*/
static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
@@ -1357,12 +1425,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
return size;
}
-/**
+/*
* rdtgroup_size_show - Display size in bytes of allocated regions
*
* The "size" file mirrors the layout of the "schemata" file, printing the
* size in bytes of each region instead of the capacity bitmask.
- *
*/
static int rdtgroup_size_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
@@ -1686,77 +1753,77 @@ static struct rftype res_common_files[] = {
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_last_cmd_status_show,
- .fflags = RF_TOP_INFO,
+ .fflags = RFTYPE_TOP_INFO,
},
{
.name = "num_closids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_num_closids_show,
- .fflags = RF_CTRL_INFO,
+ .fflags = RFTYPE_CTRL_INFO,
},
{
.name = "mon_features",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_mon_features_show,
- .fflags = RF_MON_INFO,
+ .fflags = RFTYPE_MON_INFO,
},
{
.name = "num_rmids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_num_rmids_show,
- .fflags = RF_MON_INFO,
+ .fflags = RFTYPE_MON_INFO,
},
{
.name = "cbm_mask",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_default_ctrl_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "min_cbm_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_min_cbm_bits_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "shareable_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_shareable_bits_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "bit_usage",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_bit_usage_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "min_bandwidth",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_min_bw_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
},
{
.name = "bandwidth_gran",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_bw_gran_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
},
{
.name = "delay_linear",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_delay_linear_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
},
/*
* Platform specific which (if any) capabilities are provided by
@@ -1775,7 +1842,7 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.write = max_threshold_occ_write,
.seq_show = max_threshold_occ_show,
- .fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
},
{
.name = "mbm_total_bytes_config",
@@ -1817,12 +1884,19 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_BASE,
},
{
+ .name = "mon_hw_id",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_rmid_show,
+ .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG,
+ },
+ {
.name = "schemata",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_schemata_write,
.seq_show = rdtgroup_schemata_show,
- .fflags = RF_CTRL_BASE,
+ .fflags = RFTYPE_CTRL_BASE,
},
{
.name = "mode",
@@ -1830,14 +1904,28 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_mode_write,
.seq_show = rdtgroup_mode_show,
- .fflags = RF_CTRL_BASE,
+ .fflags = RFTYPE_CTRL_BASE,
},
{
.name = "size",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdtgroup_size_show,
- .fflags = RF_CTRL_BASE,
+ .fflags = RFTYPE_CTRL_BASE,
+ },
+ {
+ .name = "sparse_masks",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_has_sparse_bitmasks_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "ctrl_hw_id",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_closid_show,
+ .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
},
};
@@ -1852,6 +1940,9 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
lockdep_assert_held(&rdtgroup_mutex);
+ if (resctrl_debug)
+ fflags |= RFTYPE_DEBUG;
+
for (rft = rfts; rft < rfts + len; rft++) {
if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
ret = rdtgroup_add_file(kn, rft);
@@ -1894,7 +1985,7 @@ void __init thread_throttle_mode_init(void)
if (!rft)
return;
- rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
+ rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
}
void __init mbm_config_rftype_init(const char *config)
@@ -1903,7 +1994,7 @@ void __init mbm_config_rftype_init(const char *config)
rft = rdtgroup_get_rftype_by_name(config);
if (rft)
- rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE;
+ rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
}
/**
@@ -2038,21 +2129,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
if (IS_ERR(kn_info))
return PTR_ERR(kn_info);
- ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
+ ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
if (ret)
goto out_destroy;
/* loop over enabled controls, these are all alloc_capable */
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- fflags = r->fflags | RF_CTRL_INFO;
+ fflags = r->fflags | RFTYPE_CTRL_INFO;
ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
if (ret)
goto out_destroy;
}
for_each_mon_capable_rdt_resource(r) {
- fflags = r->fflags | RF_MON_INFO;
+ fflags = r->fflags | RFTYPE_MON_INFO;
sprintf(name, "%s_MON", r->name);
ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
if (ret)
@@ -2271,14 +2362,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
return 0;
}
-static void cdp_disable_all(void)
-{
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-}
-
/*
* We don't allow rdtgroup directories to be created anywhere
* except the root directory. Thus when looking for the rdtgroup
@@ -2358,19 +2441,47 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
struct rdtgroup *prgrp,
struct kernfs_node **mon_data_kn);
+static void rdt_disable_ctx(void)
+{
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+ set_mba_sc(false);
+
+ resctrl_debug = false;
+}
+
static int rdt_enable_ctx(struct rdt_fs_context *ctx)
{
int ret = 0;
- if (ctx->enable_cdpl2)
+ if (ctx->enable_cdpl2) {
ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
+ if (ret)
+ goto out_done;
+ }
- if (!ret && ctx->enable_cdpl3)
+ if (ctx->enable_cdpl3) {
ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
+ if (ret)
+ goto out_cdpl2;
+ }
- if (!ret && ctx->enable_mba_mbps)
+ if (ctx->enable_mba_mbps) {
ret = set_mba_sc(true);
+ if (ret)
+ goto out_cdpl3;
+ }
+
+ if (ctx->enable_debug)
+ resctrl_debug = true;
+ return 0;
+
+out_cdpl3:
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+out_cdpl2:
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+out_done:
return ret;
}
@@ -2463,6 +2574,7 @@ static void schemata_list_destroy(void)
static int rdt_get_tree(struct fs_context *fc)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
+ unsigned long flags = RFTYPE_CTRL_BASE;
struct rdt_domain *dom;
struct rdt_resource *r;
int ret;
@@ -2477,18 +2589,31 @@ static int rdt_get_tree(struct fs_context *fc)
goto out;
}
+ ret = rdtgroup_setup_root(ctx);
+ if (ret)
+ goto out;
+
ret = rdt_enable_ctx(ctx);
- if (ret < 0)
- goto out_cdp;
+ if (ret)
+ goto out_root;
ret = schemata_list_create();
if (ret) {
schemata_list_destroy();
- goto out_mba;
+ goto out_ctx;
}
closid_init();
+ if (rdt_mon_capable)
+ flags |= RFTYPE_MON;
+
+ ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
+ if (ret)
+ goto out_schemata_free;
+
+ kernfs_activate(rdtgroup_default.kn);
+
ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
if (ret < 0)
goto out_schemata_free;
@@ -2543,11 +2668,10 @@ out_info:
kernfs_remove(kn_info);
out_schemata_free:
schemata_list_destroy();
-out_mba:
- if (ctx->enable_mba_mbps)
- set_mba_sc(false);
-out_cdp:
- cdp_disable_all();
+out_ctx:
+ rdt_disable_ctx();
+out_root:
+ rdtgroup_destroy_root();
out:
rdt_last_cmd_clear();
mutex_unlock(&rdtgroup_mutex);
@@ -2559,6 +2683,7 @@ enum rdt_param {
Opt_cdp,
Opt_cdpl2,
Opt_mba_mbps,
+ Opt_debug,
nr__rdt_params
};
@@ -2566,6 +2691,7 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = {
fsparam_flag("cdp", Opt_cdp),
fsparam_flag("cdpl2", Opt_cdpl2),
fsparam_flag("mba_MBps", Opt_mba_mbps),
+ fsparam_flag("debug", Opt_debug),
{}
};
@@ -2591,6 +2717,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
ctx->enable_mba_mbps = true;
return 0;
+ case Opt_debug:
+ ctx->enable_debug = true;
+ return 0;
}
return -EINVAL;
@@ -2618,7 +2747,6 @@ static int rdt_init_fs_context(struct fs_context *fc)
if (!ctx)
return -ENOMEM;
- ctx->kfc.root = rdt_root;
ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
fc->fs_private = &ctx->kfc;
fc->ops = &rdt_fs_context_ops;
@@ -2779,16 +2907,16 @@ static void rdt_kill_sb(struct super_block *sb)
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- set_mba_sc(false);
+ rdt_disable_ctx();
/*Put everything back to default values. */
for_each_alloc_capable_rdt_resource(r)
reset_all_ctrls(r);
- cdp_disable_all();
rmdir_all_sub();
rdt_pseudo_lock_release();
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
schemata_list_destroy();
+ rdtgroup_destroy_root();
static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
static_branch_disable_cpuslocked(&rdt_mon_enable_key);
static_branch_disable_cpuslocked(&rdt_enable_key);
@@ -3170,8 +3298,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
enum rdt_group_type rtype, struct rdtgroup **r)
{
struct rdtgroup *prdtgrp, *rdtgrp;
+ unsigned long files = 0;
struct kernfs_node *kn;
- uint files = 0;
int ret;
prdtgrp = rdtgroup_kn_lock_live(parent_kn);
@@ -3223,7 +3351,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
goto out_destroy;
}
- files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
+ if (rtype == RDTCTRL_GROUP) {
+ files = RFTYPE_BASE | RFTYPE_CTRL;
+ if (rdt_mon_capable)
+ files |= RFTYPE_MON;
+ } else {
+ files = RFTYPE_BASE | RFTYPE_MON;
+ }
+
ret = rdtgroup_add_files(kn, files);
if (ret) {
rdt_last_cmd_puts("kernfs fill error\n");
@@ -3656,6 +3791,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
seq_puts(seq, ",mba_MBps");
+ if (resctrl_debug)
+ seq_puts(seq, ",debug");
+
return 0;
}
@@ -3666,10 +3804,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
.show_options = rdtgroup_show_options,
};
-static int __init rdtgroup_setup_root(void)
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
{
- int ret;
-
rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
@@ -3677,6 +3813,20 @@ static int __init rdtgroup_setup_root(void)
if (IS_ERR(rdt_root))
return PTR_ERR(rdt_root);
+ ctx->kfc.root = rdt_root;
+ rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
+
+ return 0;
+}
+
+static void rdtgroup_destroy_root(void)
+{
+ kernfs_destroy_root(rdt_root);
+ rdtgroup_default.kn = NULL;
+}
+
+static void __init rdtgroup_setup_default(void)
+{
mutex_lock(&rdtgroup_mutex);
rdtgroup_default.closid = 0;
@@ -3686,19 +3836,7 @@ static int __init rdtgroup_setup_root(void)
list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
- ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE);
- if (ret) {
- kernfs_destroy_root(rdt_root);
- goto out;
- }
-
- rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
- kernfs_activate(rdtgroup_default.kn);
-
-out:
mutex_unlock(&rdtgroup_mutex);
-
- return ret;
}
static void domain_destroy_mon_state(struct rdt_domain *d)
@@ -3820,13 +3958,11 @@ int __init rdtgroup_init(void)
seq_buf_init(&last_cmd_status, last_cmd_status_buf,
sizeof(last_cmd_status_buf));
- ret = rdtgroup_setup_root();
- if (ret)
- return ret;
+ rdtgroup_setup_default();
ret = sysfs_create_mount_point(fs_kobj, "resctrl");
if (ret)
- goto cleanup_root;
+ return ret;
ret = register_filesystem(&rdt_fs_type);
if (ret)
@@ -3859,8 +3995,6 @@ int __init rdtgroup_init(void)
cleanup_mountpoint:
sysfs_remove_mount_point(fs_kobj, "resctrl");
-cleanup_root:
- kernfs_destroy_root(rdt_root);
return ret;
}
@@ -3870,5 +4004,4 @@ void __exit rdtgroup_exit(void)
debugfs_remove_recursive(debugfs_resctrl);
unregister_filesystem(&rdt_fs_type);
sysfs_remove_mount_point(fs_kobj, "resctrl");
- kernfs_destroy_root(rdt_root);
}
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 91fa70e51004..279148e72459 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
return epc_page;
}
+/*
+ * Ensure the SECS page is not swapped out. Must be called with encl->lock
+ * to protect the enclave states including SECS and ensure the SECS page is
+ * not swapped out again while being used.
+ */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+ struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+ if (!epc_page)
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+ return epc_page;
+}
+
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
struct sgx_encl_page *entry)
{
@@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
return entry;
}
- if (!(encl->secs.epc_page)) {
- epc_page = sgx_encl_eldu(&encl->secs, NULL);
- if (IS_ERR(epc_page))
- return ERR_CAST(epc_page);
- }
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
if (IS_ERR(epc_page))
@@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
epc_page = sgx_alloc_epc_page(encl_page, false);
if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 5d390df21440..b65ab214bdf5 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -581,7 +581,7 @@ err_out:
*
* Flush any outstanding enqueued EADD operations and perform EINIT. The
* Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
- * the enclave's MRSIGNER, which is caculated from the provided sigstruct.
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
*
* Return:
* - 0: Success.
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index c3e37eaec8ec..7aaa3652e31d 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -204,6 +204,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
continue;
xa_erase(&vepc->page_array, index);
+ cond_resched();
}
/*
@@ -222,6 +223,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
list_add_tail(&epc_page->list, &secs_pages);
xa_erase(&vepc->page_array, index);
+ cond_resched();
}
/*
@@ -243,6 +245,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
if (sgx_vepc_free_page(epc_page))
list_add_tail(&epc_page->list, &secs_pages);
+ cond_resched();
}
if (!list_empty(&secs_pages))
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 0270925fe013..dc136703566f 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -78,7 +78,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c)
/*
* initial apic id, which also represents 32-bit extended x2apic id.
*/
- c->initial_apicid = edx;
+ c->topo.initial_apicid = edx;
smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
#endif
return 0;
@@ -108,7 +108,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
* Populate HT related information from sub-leaf level 0.
*/
cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
- c->initial_apicid = edx;
+ c->topo.initial_apicid = edx;
core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
@@ -146,20 +146,19 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
die_select_mask = (~(-1 << die_plus_mask_width)) >>
core_plus_mask_width;
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid,
+ c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid,
ht_mask_width) & core_select_mask;
if (die_level_present) {
- c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
+ c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid,
core_plus_mask_width) & die_select_mask;
}
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
- pkg_mask_width);
+ c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width);
/*
* Reinit the apicid, now that we have extended initial_apicid.
*/
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
__max_die_per_package = (die_level_siblings / core_level_siblings);
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 05fa4ef63490..415564a6523b 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -65,20 +65,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
-
- if (c->cpuid_level >= 0x00000001) {
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- /*
- * If HTT (EDX[28]) is set EBX[16:23] contain the number of
- * apicids which are reserved per package. Store the resulting
- * shift value for the package management code.
- */
- if (edx & (1U << 28))
- c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
- }
-
}
static void init_zhaoxin(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 587c7743fd21..b6b044356f1b 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -48,27 +48,6 @@ struct crash_memmap_data {
unsigned int type;
};
-/*
- * This is used to VMCLEAR all VMCSs loaded on the
- * processor. And when loading kvm_intel module, the
- * callback function pointer will be assigned.
- *
- * protected by rcu.
- */
-crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
-EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
-
-static inline void cpu_crash_vmclear_loaded_vmcss(void)
-{
- crash_vmclear_fn *do_vmclear_operation = NULL;
-
- rcu_read_lock();
- do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
- if (do_vmclear_operation)
- do_vmclear_operation();
- rcu_read_unlock();
-}
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -76,11 +55,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
crash_save_cpu(regs, cpu);
/*
- * VMCLEAR VMCSs loaded on all cpus if needed.
- */
- cpu_crash_vmclear_loaded_vmcss();
-
- /*
* Disable Intel PT to stop its logging
*/
cpu_emergency_stop_pt();
@@ -133,11 +107,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
crash_smp_send_stop();
- /*
- * VMCLEAR VMCSs loaded on this cpu if needed.
- */
- cpu_crash_vmclear_loaded_vmcss();
-
cpu_emergency_disable_virtualization();
/*
@@ -201,7 +170,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
int ret = 0;
/* Exclude the low 1M because it is always reserved */
- ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1);
+ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
if (ret)
return ret;
@@ -229,8 +198,8 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
}
/* Prepare elf headers. Return addr and size */
-static int prepare_elf_headers(struct kimage *image, void **addr,
- unsigned long *sz, unsigned long *nr_mem_ranges)
+static int prepare_elf_headers(void **addr, unsigned long *sz,
+ unsigned long *nr_mem_ranges)
{
struct crash_mem *cmem;
int ret;
@@ -252,7 +221,7 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
*nr_mem_ranges = cmem->nr_ranges;
/* By default prepare 64bit headers */
- ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
out:
vfree(cmem);
@@ -380,7 +349,7 @@ int crash_load_segments(struct kimage *image)
.buf_max = ULONG_MAX, .top_down = false };
/* Prepare elf headers and add a segment */
- ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum);
+ ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum);
if (ret)
return ret;
@@ -417,8 +386,8 @@ int crash_load_segments(struct kimage *image)
if (ret)
return ret;
image->elf_load_addr = kbuf.mem;
- pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+ kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
return ret;
}
@@ -483,7 +452,7 @@ void arch_crash_handle_hotplug_event(struct kimage *image)
* Create the new elfcorehdr reflecting the changes to CPU and/or
* memory resources.
*/
- if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) {
+ if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) {
pr_err("unable to create new elfcorehdr");
goto out;
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 87d38f17ff5c..afd09924094e 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -278,7 +278,7 @@ static void __init dtb_apic_setup(void)
}
#ifdef CONFIG_OF_EARLY_FLATTREE
-static void __init x86_flattree_get_config(void)
+void __init x86_flattree_get_config(void)
{
u32 size, map_len;
void *dt;
@@ -300,14 +300,10 @@ static void __init x86_flattree_get_config(void)
unflatten_and_copy_device_tree();
early_memunmap(dt, map_len);
}
-#else
-static inline void x86_flattree_get_config(void) { }
#endif
void __init x86_dtb_init(void)
{
- x86_flattree_get_config();
-
if (!of_have_populated_dt())
return;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a6c1867fc7aa..59f4aefc6bc1 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -779,13 +779,13 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
- if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+ if ((type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_BRIDGE) {
sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
if (sec > num)
early_pci_scan_bus(sec);
}
- if (!(type & 0x80))
+ if (!(type & PCI_HEADER_TYPE_MFD))
return -1;
return 0;
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index 794e70151203..a06b876bbf2d 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -2,6 +2,7 @@
/*
* x86 FPU bug checks:
*/
+#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
/*
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index a86d37052a64..520deb411a70 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
* Must be invoked from KVM after a VMEXIT before enabling interrupts when
* XFD write emulation is disabled. This is required because the guest can
* freely modify XFD and the state at VMEXIT is not guaranteed to be the
- * same as the state on VMENTER. So software state has to be udpated before
+ * same as the state on VMENTER. So software state has to be updated before
* any operation which depends on it can take place.
*
* Note: It can be invoked unconditionally even when write emulation is
@@ -369,14 +369,15 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
- unsigned int size, u32 pkru)
+ unsigned int size, u64 xfeatures, u32 pkru)
{
struct fpstate *kstate = gfpu->fpstate;
union fpregs_state *ustate = buf;
struct membuf mb = { .p = buf, .left = size };
if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
- __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE);
+ __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru,
+ XSTATE_COPY_XSAVE);
} else {
memcpy(&ustate->fxsave, &kstate->regs.fxsave,
sizeof(ustate->fxsave));
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 558076dbde5b..247f2225aa9f 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -274,12 +274,13 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
* Attempt to restore the FPU registers directly from user memory.
* Pagefaults are handled and any errors returned are fatal.
*/
-static bool restore_fpregs_from_user(void __user *buf, u64 xrestore,
- bool fx_only, unsigned int size)
+static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only)
{
struct fpu *fpu = &current->thread.fpu;
int ret;
+ /* Restore enabled features only. */
+ xrestore &= fpu->fpstate->user_xfeatures;
retry:
fpregs_lock();
/* Ensure that XFD is up to date */
@@ -309,7 +310,7 @@ retry:
if (ret != X86_TRAP_PF)
return false;
- if (!fault_in_readable(buf, size))
+ if (!fault_in_readable(buf, fpu->fpstate->user_size))
goto retry;
return false;
}
@@ -339,7 +340,6 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
struct user_i387_ia32_struct env;
bool success, fx_only = false;
union fpregs_state *fpregs;
- unsigned int state_size;
u64 user_xfeatures = 0;
if (use_xsave()) {
@@ -349,17 +349,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
return false;
fx_only = !fx_sw_user.magic1;
- state_size = fx_sw_user.xstate_size;
user_xfeatures = fx_sw_user.xfeatures;
} else {
user_xfeatures = XFEATURE_MASK_FPSSE;
- state_size = fpu->fpstate->user_size;
}
if (likely(!ia32_fxstate)) {
/* Restore the FPU registers directly from user memory. */
- return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only,
- state_size);
+ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only);
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index cadf68737e6b..117e74c44e75 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1049,6 +1049,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
* __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
* @to: membuf descriptor
* @fpstate: The fpstate buffer from which to copy
+ * @xfeatures: The mask of xfeatures to save (XSAVE mode only)
* @pkru_val: The PKRU value to store in the PKRU component
* @copy_mode: The requested copy mode
*
@@ -1059,7 +1060,8 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
* It supports partial copy but @to.pos always starts from zero.
*/
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
- u32 pkru_val, enum xstate_copy_mode copy_mode)
+ u64 xfeatures, u32 pkru_val,
+ enum xstate_copy_mode copy_mode)
{
const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
struct xregs_state *xinit = &init_fpstate.regs.xsave;
@@ -1083,7 +1085,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
break;
case XSTATE_COPY_XSAVE:
- header.xfeatures &= fpstate->user_xfeatures;
+ header.xfeatures &= fpstate->user_xfeatures & xfeatures;
break;
}
@@ -1185,6 +1187,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode copy_mode)
{
__copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
+ tsk->thread.fpu.fpstate->user_xfeatures,
tsk->thread.pkru, copy_mode);
}
@@ -1536,10 +1539,7 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
fpregs_restore_userregs();
newfps->xfeatures = curfps->xfeatures | xfeatures;
-
- if (!guest_fpu)
- newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
-
+ newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
newfps->xfd = curfps->xfd & ~xfeatures;
/* Do the final updates within the locked region */
@@ -1736,7 +1736,6 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
/**
* fpu_xstate_prctl - xstate permission operations
- * @tsk: Redundant pointer to current
* @option: A subfunction of arch_prctl()
* @arg2: option argument
* Return: 0 if successful; otherwise, an error code
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index a4ecb04d8d64..3518fb26d06b 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -43,7 +43,8 @@ enum xstate_copy_mode {
struct membuf;
extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
- u32 pkru_val, enum xstate_copy_mode copy_mode);
+ u64 xfeatures, u32 pkru_val,
+ enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode mode);
extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index 24c1175a47e2..58d9ed50fe61 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -3,10 +3,10 @@
* Copyright (C) 2017 Steven Rostedt, VMware Inc.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/page_types.h>
#include <asm/segment.h>
-#include <asm/export.h>
#include <asm/ftrace.h>
#include <asm/nospec-branch.h>
#include <asm/frame.h>
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 945cfa5f7239..214f30e9f0c0 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -3,12 +3,12 @@
* Copyright (C) 2014 Steven Rostedt, Red Hat Inc
*/
+#include <linux/export.h>
#include <linux/cfi_types.h>
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/ptrace.h>
#include <asm/ftrace.h>
-#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/frame.h>
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 246a609f889b..de001b2146ab 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -19,6 +19,7 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
+#include <asm/microcode.h>
#include <asm/tlbflush.h>
#include <asm/bootparam_utils.h>
@@ -29,11 +30,33 @@ static void __init i386_default_early_setup(void)
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
}
+#ifdef CONFIG_MICROCODE_INITRD32
+unsigned long __initdata initrd_start_early;
+static pte_t __initdata *initrd_pl2p_start, *initrd_pl2p_end;
+
+static void zap_early_initrd_mapping(void)
+{
+ pte_t *pl2p = initrd_pl2p_start;
+
+ for (; pl2p < initrd_pl2p_end; pl2p++) {
+ *pl2p = (pte_t){ .pte = 0 };
+
+ if (!IS_ENABLED(CONFIG_X86_PAE))
+ *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = (pte_t) {.pte = 0};
+ }
+}
+#else
+static inline void zap_early_initrd_mapping(void) { }
+#endif
+
asmlinkage __visible void __init __noreturn i386_start_kernel(void)
{
/* Make sure IDT is set up before any exception happens */
idt_setup_early_handler();
+ load_ucode_bsp();
+ zap_early_initrd_mapping();
+
cr4_init_shadow();
sanitize_boot_params(&boot_params);
@@ -69,52 +92,83 @@ asmlinkage __visible void __init __noreturn i386_start_kernel(void)
* to the first kernel PMD. Note the upper half of each PMD or PTE are
* always zero at this stage.
*/
-void __init mk_early_pgtbl_32(void);
-void __init mk_early_pgtbl_32(void)
-{
-#ifdef __pa
-#undef __pa
-#endif
-#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)
- pte_t pte, *ptep;
- int i;
- unsigned long *ptr;
- /* Enough space to fit pagetables for the low memory linear map */
- const unsigned long limit = __pa(_end) +
- (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
#ifdef CONFIG_X86_PAE
- pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd);
-#define SET_PL2(pl2, val) { (pl2).pmd = (val); }
+typedef pmd_t pl2_t;
+#define pl2_base initial_pg_pmd
+#define SET_PL2(val) { .pmd = (val), }
#else
- pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table);
-#define SET_PL2(pl2, val) { (pl2).pgd = (val); }
+typedef pgd_t pl2_t;
+#define pl2_base initial_page_table
+#define SET_PL2(val) { .pgd = (val), }
#endif
- ptep = (pte_t *)__pa(__brk_base);
- pte.pte = PTE_IDENT_ATTR;
-
+static __init __no_stack_protector pte_t init_map(pte_t pte, pte_t **ptep, pl2_t **pl2p,
+ const unsigned long limit)
+{
while ((pte.pte & PTE_PFN_MASK) < limit) {
+ pl2_t pl2 = SET_PL2((unsigned long)*ptep | PDE_IDENT_ATTR);
+ int i;
+
+ **pl2p = pl2;
+ if (!IS_ENABLED(CONFIG_X86_PAE)) {
+ /* Kernel PDE entry */
+ *(*pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
+ }
- SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR);
- *pl2p = pl2;
-#ifndef CONFIG_X86_PAE
- /* Kernel PDE entry */
- *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
-#endif
for (i = 0; i < PTRS_PER_PTE; i++) {
- *ptep = pte;
+ **ptep = pte;
pte.pte += PAGE_SIZE;
- ptep++;
+ (*ptep)++;
}
-
- pl2p++;
+ (*pl2p)++;
}
+ return pte;
+}
+
+void __init __no_stack_protector mk_early_pgtbl_32(void)
+{
+ /* Enough space to fit pagetables for the low memory linear map */
+ unsigned long limit = __pa_nodebug(_end) + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
+ pte_t pte, *ptep = (pte_t *)__pa_nodebug(__brk_base);
+ struct boot_params __maybe_unused *params;
+ pl2_t *pl2p = (pl2_t *)__pa_nodebug(pl2_base);
+ unsigned long *ptr;
+
+ pte.pte = PTE_IDENT_ATTR;
+ pte = init_map(pte, &ptep, &pl2p, limit);
- ptr = (unsigned long *)__pa(&max_pfn_mapped);
+ ptr = (unsigned long *)__pa_nodebug(&max_pfn_mapped);
/* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
*ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
- ptr = (unsigned long *)__pa(&_brk_end);
+ ptr = (unsigned long *)__pa_nodebug(&_brk_end);
*ptr = (unsigned long)ptep + PAGE_OFFSET;
-}
+#ifdef CONFIG_MICROCODE_INITRD32
+ /* Running on a hypervisor? */
+ if (native_cpuid_ecx(1) & BIT(31))
+ return;
+
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
+ return;
+
+ /* Save the virtual start address */
+ ptr = (unsigned long *)__pa_nodebug(&initrd_start_early);
+ *ptr = (pte.pte & PTE_PFN_MASK) + PAGE_OFFSET;
+ *ptr += ((unsigned long)params->hdr.ramdisk_image) & ~PAGE_MASK;
+
+ /* Save PLP2 for cleanup */
+ ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_start);
+ *ptr = (unsigned long)pl2p + PAGE_OFFSET;
+
+ limit = (unsigned long)params->hdr.ramdisk_image;
+ pte.pte = PTE_IDENT_ATTR | PFN_ALIGN(limit);
+ limit = (unsigned long)params->hdr.ramdisk_image + params->hdr.ramdisk_size;
+
+ init_map(pte, &ptep, &pl2p, limit);
+
+ ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_end);
+ *ptr = (unsigned long)pl2p + PAGE_OFFSET;
+#endif
+}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 49f7629b17f7..dc0956067944 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -41,6 +41,7 @@
#include <asm/trapnr.h>
#include <asm/sev.h>
#include <asm/tdx.h>
+#include <asm/init.h>
/*
* Manage page tables very early on.
@@ -69,23 +70,21 @@ EXPORT_SYMBOL(vmemmap_base);
/*
* GDT used on the boot CPU before switching to virtual addresses.
*/
-static struct desc_struct startup_gdt[GDT_ENTRIES] = {
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = {
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
};
/*
* Address needs to be set at runtime because it references the startup_gdt
* while the kernel still uses a direct mapping.
*/
-static struct desc_ptr startup_gdt_descr = {
- .size = sizeof(startup_gdt),
+static struct desc_ptr startup_gdt_descr __initdata = {
+ .size = sizeof(startup_gdt)-1,
.address = 0,
};
-#define __head __section(".head.text")
-
static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
{
return ptr - (void *)_text + (void *)physaddr;
@@ -211,7 +210,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Fixup the physical addresses in the page table */
- pgd = fixup_pointer(&early_top_pgt, physaddr);
+ pgd = fixup_pointer(early_top_pgt, physaddr);
p = pgd + pgd_index(__START_KERNEL_map);
if (la57)
*p = (unsigned long)level4_kernel_pgt;
@@ -220,11 +219,11 @@ unsigned long __head __startup_64(unsigned long physaddr,
*p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
if (la57) {
- p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+ p4d = fixup_pointer(level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}
- pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+ pud = fixup_pointer(level3_kernel_pgt, physaddr);
pud[510] += load_delta;
pud[511] += load_delta;
@@ -588,7 +587,7 @@ static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
}
/* This runs while still in the direct mapping */
-static void startup_64_load_idt(unsigned long physbase)
+static void __head startup_64_load_idt(unsigned long physbase)
{
struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c9318993f959..487ac57e2c81 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -8,6 +8,7 @@
*/
.text
+#include <linux/export.h>
#include <linux/threads.h>
#include <linux/init.h>
#include <linux/linkage.h>
@@ -25,7 +26,6 @@
#include <asm/nops.h>
#include <asm/nospec-branch.h>
#include <asm/bootparam.h>
-#include <asm/export.h>
#include <asm/pgtable_32.h>
/* Physical address */
@@ -118,11 +118,6 @@ SYM_CODE_START(startup_32)
movl %eax, pa(olpc_ofw_pgd)
#endif
-#ifdef CONFIG_MICROCODE
- /* Early load ucode on BSP. */
- call load_ucode_bsp
-#endif
-
/* Create early pagetables. */
call mk_early_pgtbl_32
@@ -157,11 +152,6 @@ SYM_FUNC_START(startup_32_smp)
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
-#ifdef CONFIG_MICROCODE
- /* Early load ucode on AP. */
- call load_ucode_ap
-#endif
-
.Ldefault_entry:
movl $(CR0_STATE & ~X86_CR0_PG),%eax
movl %eax,%cr0
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index bfe5ec2f4f83..bb8ee1ce6968 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -9,7 +9,7 @@
* Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
*/
-
+#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
@@ -22,7 +22,6 @@
#include <asm/percpu.h>
#include <asm/nops.h>
#include "../entry/calling.h"
-#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/apicdef.h>
#include <asm/fixmap.h>
@@ -115,6 +114,28 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ mov %rax, %rdi
+ mov %rax, %r14
+
+ addq phys_base(%rip), %rdi
+
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ */
+ call sev_verify_cbit
+
+ /*
+ * Restore CR3 value without the phys_base which will be added
+ * below, before writing %cr3.
+ */
+ mov %r14, %rax
+#endif
+
jmp 1f
SYM_CODE_END(startup_64)
@@ -180,10 +201,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movl $0, %ecx
#endif
- /* Enable PAE mode, PGE and LA57 */
- orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ /* Enable PAE mode, PSE, PGE and LA57 */
+ orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
- testl $1, __pgtable_l5_enabled(%rip)
+ testb $1, __pgtable_l5_enabled(%rip)
jz 1f
orl $X86_CR4_LA57, %ecx
1:
@@ -194,21 +215,12 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
addq phys_base(%rip), %rax
/*
- * For SEV guests: Verify that the C-bit is correct. A malicious
- * hypervisor could lie about the C-bit position to perform a ROP
- * attack on the guest by writing to the unencrypted stack and wait for
- * the next RET instruction.
- */
- movq %rax, %rdi
- call sev_verify_cbit
-
- /*
* Switch to new page-table
*
* For the boot CPU this switches to early_top_pgt which still has the
- * indentity mappings present. The secondary CPUs will switch to the
+ * identity mappings present. The secondary CPUs will switch to the
* init_top_pgt here, away from the trampoline_pgd and unmap the
- * indentity mapped ranges.
+ * identity mapped ranges.
*/
movq %rax, %cr3
@@ -256,6 +268,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
testl $X2APIC_ENABLE, %eax
jnz .Lread_apicid_msr
+#ifdef CONFIG_X86_X2APIC
+ /*
+ * If system is in X2APIC mode then MMIO base might not be
+ * mapped causing the MMIO read below to fault. Faults can't
+ * be handled at that point.
+ */
+ cmpl $0, x2apic_mode(%rip)
+ jz .Lread_apicid_mmio
+
+ /* Force the AP into X2APIC mode. */
+ orl $X2APIC_ENABLE, %eax
+ wrmsr
+ jmp .Lread_apicid_msr
+#endif
+
+.Lread_apicid_mmio:
/* Read the APIC ID from the fix-mapped MMIO space. */
movq apic_mmio_base(%rip), %rcx
addq $APIC_ID, %rcx
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1648aa0204d9..a38d0c93a66e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -52,7 +52,7 @@ unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
bool hpet_msi_disable;
-#ifdef CONFIG_GENERIC_MSI_IRQ
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel);
static struct irq_domain *hpet_domain;
#endif
@@ -469,7 +469,7 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
/*
* HPET MSI Support
*/
-#ifdef CONFIG_GENERIC_MSI_IRQ
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
static void hpet_msi_unmask(struct irq_data *data)
{
struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
@@ -707,7 +707,7 @@ static void __init hpet_select_clockevents(void)
hpet_base.nr_clockevents = 0;
- /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */
+ /* No point if MSI is disabled or CPU has an Always Running APIC Timer */
if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
return;
@@ -965,7 +965,7 @@ static bool __init mwait_pc10_supported(void)
* and per CPU timer interrupts.
*
* The probability that this problem is going to be solved in the
- * forseeable future is close to zero, so the kernel has to be cluttered
+ * foreseeable future is close to zero, so the kernel has to be cluttered
* with heuristics to keep up with the ever growing amount of hardware and
* firmware trainwrecks. Hopefully some day hardware people will understand
* that the approach of "This can be fixed in software" is not sustainable.
@@ -1438,7 +1438,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
memset(&curr_time, 0, sizeof(struct rtc_time));
if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) {
- if (unlikely(mc146818_get_time(&curr_time) < 0)) {
+ if (unlikely(mc146818_get_time(&curr_time, 10) < 0)) {
pr_err_ratelimited("unable to read current time from RTC\n");
return IRQ_HANDLED;
}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 30a55207c000..c20d1832c481 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -32,6 +32,7 @@
*/
static void init_8259A(int auto_eoi);
+static bool pcat_compat __ro_after_init;
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);
@@ -299,15 +300,32 @@ static void unmask_8259A(void)
static int probe_8259A(void)
{
+ unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
unsigned long flags;
- unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
- unsigned char new_val;
+
+ /*
+ * If MADT has the PCAT_COMPAT flag set, then do not bother probing
+ * for the PIC. Some BIOSes leave the PIC uninitialized and probing
+ * fails.
+ *
+ * Right now this causes problems as quite some code depends on
+ * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
+ * when the system has an IO/APIC because then PIC is not required
+ * at all, except for really old machines where the timer interrupt
+ * must be routed through the PIC. So just pretend that the PIC is
+ * there and let legacy_pic->init() initialize it for nothing.
+ *
+ * Alternatively this could just try to initialize the PIC and
+ * repeat the probe, but for cases where there is no PIC that's
+ * just pointless.
+ */
+ if (pcat_compat)
+ return nr_legacy_irqs();
+
/*
- * Check to see if we have a PIC.
- * Mask all except the cascade and read
- * back the value we just wrote. If we don't
- * have a PIC, we will read 0xff as opposed to the
- * value we wrote.
+ * Check to see if we have a PIC. Mask all except the cascade and
+ * read back the value we just wrote. If we don't have a PIC, we
+ * will read 0xff as opposed to the value we wrote.
*/
raw_spin_lock_irqsave(&i8259A_lock, flags);
@@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void)
return 0;
}
-
device_initcall(i8259A_init_ops);
+
+void __init legacy_pic_pcat_compat(void)
+{
+ pcat_compat = true;
+}
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index b786d48f5a0f..660b601f1d6c 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -10,6 +10,7 @@
#include <asm/proto.h>
#include <asm/desc.h>
#include <asm/hw_irq.h>
+#include <asm/ia32.h>
#include <asm/idtentry.h>
#define DPL0 0x0
@@ -116,8 +117,11 @@ static const __initconst struct idt_data def_idts[] = {
#endif
SYSG(X86_TRAP_OF, asm_exc_overflow),
+};
+
+static const struct idt_data ia32_idt[] __initconst = {
#if defined(CONFIG_IA32_EMULATION)
- SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
+ SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation),
#elif defined(CONFIG_X86_32)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
#endif
@@ -225,6 +229,9 @@ void __init idt_setup_early_traps(void)
void __init idt_setup_traps(void)
{
idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+
+ if (ia32_enabled())
+ idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index aaf9e776f323..7f542a7799cb 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/asm.h>
-#include <asm/export.h>
+#include <linux/export.h>
#include <linux/linkage.h>
/*
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index ee4fe8cdb857..9a7c03d47861 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -74,7 +74,6 @@ static struct ctl_table itmt_kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static struct ctl_table_header *itmt_sysctl_header;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index a61c12c01270..2a422e00ed4b 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -82,7 +82,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
cmdline_ptr[cmdline_len - 1] = '\0';
- pr_debug("Final command line is: %s\n", cmdline_ptr);
+ kexec_dprintk("Final command line is: %s\n", cmdline_ptr);
cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
cmdline_ext_32 = cmdline_ptr_phys >> 32;
@@ -272,7 +272,12 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
nr_e820_entries = params->e820_entries;
+ kexec_dprintk("E820 memmap:\n");
for (i = 0; i < nr_e820_entries; i++) {
+ kexec_dprintk("%016llx-%016llx (%d)\n",
+ params->e820_table[i].addr,
+ params->e820_table[i].addr + params->e820_table[i].size - 1,
+ params->e820_table[i].type);
if (params->e820_table[i].type != E820_TYPE_RAM)
continue;
start = params->e820_table[i].addr;
@@ -424,7 +429,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
* command line. Make sure it does not overflow
*/
if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
- pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ pr_err("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
return ERR_PTR(-EINVAL);
}
@@ -445,7 +450,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
return ERR_PTR(ret);
}
- pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+ kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
/*
@@ -490,8 +495,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
if (ret)
goto out_free_params;
bootparam_load_addr = kbuf.mem;
- pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
+ kexec_dprintk("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load kernel */
kbuf.buffer = kernel + kern16_size;
@@ -505,8 +510,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
goto out_free_params;
kernel_load_addr = kbuf.mem;
- pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+ kexec_dprintk("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load initrd high */
if (initrd) {
@@ -520,8 +525,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
goto out_free_params;
initrd_load_addr = kbuf.mem;
- pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- initrd_load_addr, initrd_len, initrd_len);
+ kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
setup_initrd(params, initrd_load_addr, initrd_len);
}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 3a43a2dee658..9c9faa1634fb 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -695,7 +695,6 @@ void kgdb_arch_exit(void)
}
/**
- *
* kgdb_skipexception - Bail out of KGDB when we've been triggered.
* @exception: Exception vector number
* @regs: Current &struct pt_regs.
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index e8babebad7b8..a0ce46c0a2d8 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -576,7 +576,8 @@ static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
{
unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
- int3_emulate_call(regs, regs_get_register(regs, offs));
+ int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size);
+ int3_emulate_jmp(regs, regs_get_register(regs, offs));
}
NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ab9ee5896c..428ee74002e1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -434,7 +434,8 @@ static void __init sev_map_percpu_data(void)
{
int cpu;
- if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ if (cc_vendor != CC_VENDOR_AMD ||
+ !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
for_each_possible_cpu(cpu) {
@@ -500,13 +501,13 @@ static bool pv_sched_yield_supported(void)
static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
unsigned long flags;
- int cpu, apic_id, icr;
- int min = 0, max = 0;
+ int cpu, min = 0, max = 0;
#ifdef CONFIG_X86_64
__uint128_t ipi_bitmap = 0;
#else
u64 ipi_bitmap = 0;
#endif
+ u32 apic_id, icr;
long ret;
if (cpumask_empty(mask))
@@ -803,8 +804,8 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
"setne %al\n\t"
-DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted,
- PV_VCPU_PREEMPTED_ASM, .text);
+DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
+ PV_VCPU_PREEMPTED_ASM, .text);
#endif
static void __init kvm_guest_init(void)
@@ -942,7 +943,7 @@ static void __init kvm_init_platform(void)
* Reset the host's shared pages list related to kernel
* specific page encryption status settings before we load a
* new kernel by kexec. Reset the page encryption status
- * during early boot intead of just before kexec to avoid SMP
+ * during early boot instead of just before kexec to avoid SMP
* races during kvm_pv_guest_cpu_reboot().
* NOTE: We cannot reset the complete shared pages list
* here as we need to retain the UEFI/OVMF firmware
@@ -1028,8 +1029,8 @@ arch_initcall(activate_jump_labels);
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
static void kvm_kick_cpu(int cpu)
{
- int apicid;
unsigned long flags = 0;
+ u32 apicid;
apicid = per_cpu(x86_cpu_to_apicid, cpu);
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index fb8f52149be9..5bb395551c44 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -24,8 +24,8 @@
static int kvmclock __initdata = 1;
static int kvmclock_vsyscall __initdata = 1;
-static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
-static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
+static int msr_kvm_system_time __ro_after_init;
+static int msr_kvm_wall_clock __ro_after_init;
static u64 kvm_sched_clock_offset __ro_after_init;
static int __init parse_no_kvmclock(char *arg)
@@ -42,7 +42,7 @@ static int __init parse_no_kvmclock_vsyscall(char *arg)
}
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
+/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
#define HVC_BOOT_ARRAY_SIZE \
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
@@ -195,7 +195,8 @@ static void kvm_setup_secondary_clock(void)
void kvmclock_disable(void)
{
- native_write_msr(msr_kvm_system_time, 0, 0);
+ if (msr_kvm_system_time)
+ native_write_msr(msr_kvm_system_time, 0, 0);
}
static void __init kvmclock_init_mem(void)
@@ -294,7 +295,10 @@ void __init kvmclock_init(void)
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
- } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+ } else {
return;
}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index adc67f98819a..7a814b41402d 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,7 +7,7 @@
* This handles calls from both 32bit and 64bit mode.
*
* Lock order:
- * contex.ldt_usr_sem
+ * context.ldt_usr_sem
* mmap_lock
* context.lock
*/
@@ -49,7 +49,7 @@ void load_mm_ldt(struct mm_struct *mm)
/*
* Any change to mm->context.ldt is followed by an IPI to all
* CPUs with the mm active. The LDT will not be freed until
- * after the IPI is handled by all such CPUs. This means that,
+ * after the IPI is handled by all such CPUs. This means that
* if the ldt_struct changes before we return, the values we see
* will be safe, and the new values will be loaded before we run
* any user code.
@@ -685,7 +685,7 @@ SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
}
/*
* The SYSCALL_DEFINE() macros give us an 'unsigned long'
- * return type, but tht ABI for sys_modify_ldt() expects
+ * return type, but the ABI for sys_modify_ldt() expects
* 'int'. This cast gives us an int-sized value in %rax
* for the return code. The 'unsigned' is necessary so
* the compiler does not try to sign-extend the negative
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 1a3e2c05a8a5..bc0a5348b4a6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -42,12 +42,9 @@ struct init_pgtable_data {
static int mem_region_callback(struct resource *res, void *arg)
{
struct init_pgtable_data *data = arg;
- unsigned long mstart, mend;
-
- mstart = res->start;
- mend = mstart + resource_size(res) - 1;
- return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
+ return kernel_ident_mapping_init(data->info, data->level4p,
+ res->start, res->end + 1);
}
static int
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5f71a0cf4399..e18914c0e38a 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -276,7 +276,7 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *alt = NULL, *locks = NULL,
- *para = NULL, *orc = NULL, *orc_ip = NULL,
+ *orc = NULL, *orc_ip = NULL,
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
*calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
@@ -286,8 +286,6 @@ int module_finalize(const Elf_Ehdr *hdr,
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
@@ -304,14 +302,6 @@ int module_finalize(const Elf_Ehdr *hdr,
ibt_endbr = s;
}
- /*
- * See alternative_instructions() for the ordering rules between the
- * various patching types.
- */
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
if (retpolines || cfi) {
void *rseg = NULL, *cseg = NULL;
unsigned int rsize = 0, csize = 0;
@@ -341,7 +331,7 @@ int module_finalize(const Elf_Ehdr *hdr,
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
- if (calls || para) {
+ if (calls || alt) {
struct callthunk_sites cs = {};
if (calls) {
@@ -349,9 +339,9 @@ int module_finalize(const Elf_Ehdr *hdr,
cs.call_end = (void *)calls->sh_addr + calls->sh_size;
}
- if (para) {
- cs.pv_start = (void *)para->sh_addr;
- cs.pv_end = (void *)para->sh_addr + para->sh_size;
+ if (alt) {
+ cs.alt_start = (void *)alt->sh_addr;
+ cs.alt_end = (void *)alt->sh_addr + alt->sh_size;
}
callthunks_patch_module_calls(&cs, me);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index a0c551846b35..17e955ab69fe 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -33,6 +33,7 @@
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
+#include <asm/microcode.h>
#include <asm/sev.h>
#define CREATE_TRACE_POINTS
@@ -343,6 +344,9 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
instrumentation_begin();
+ if (microcode_nmi_handler_enabled() && microcode_nmi_handler())
+ goto out;
+
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
@@ -498,8 +502,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
raw_atomic_long_inc(&nsp->idt_calls);
- if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
+ if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) {
+ if (microcode_nmi_handler_enabled())
+ microcode_offline_nmi_handler();
return;
+ }
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
this_cpu_write(nmi_state, NMI_LATCHED);
@@ -507,12 +514,13 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
}
this_cpu_write(nmi_state, NMI_EXECUTING);
this_cpu_write(nmi_cr2, read_cr2());
+
+nmi_restart:
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
WRITE_ONCE(nsp->recv_jiffies, jiffies);
}
-nmi_restart:
/*
* Needs to happen before DR7 is accessed, because the hypervisor can
@@ -548,16 +556,16 @@ nmi_restart:
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2));
- if (this_cpu_dec_return(nmi_state))
- goto nmi_restart;
-
- if (user_mode(regs))
- mds_user_clear_cpu_buffers();
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
WARN_ON_ONCE(nsp->idt_seq & 0x1);
WRITE_ONCE(nsp->recv_jiffies, jiffies);
}
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
+
+ if (user_mode(regs))
+ mds_user_clear_cpu_buffers();
}
#if IS_ENABLED(CONFIG_KVM_INTEL)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 975f98d5eee5..5358d43886ad 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -34,14 +34,8 @@
#include <asm/io_bitmap.h>
#include <asm/gsseg.h>
-/*
- * nop stub, which must not clobber anything *including the stack* to
- * avoid confusing the entry prologues.
- */
-DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text);
-
/* stub always returning 0. */
-DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text);
+DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);
void __init default_banner(void)
{
@@ -49,26 +43,12 @@ void __init default_banner(void)
pv_info.name);
}
-/* Undefined instruction for dealing with missing ops pointers. */
-noinstr void paravirt_BUG(void)
-{
- BUG();
-}
-
-static unsigned paravirt_patch_call(void *insn_buff, const void *target,
- unsigned long addr, unsigned len)
-{
- __text_gen_insn(insn_buff, CALL_INSN_OPCODE,
- (void *)addr, target, CALL_INSN_SIZE);
- return CALL_INSN_SIZE;
-}
-
#ifdef CONFIG_PARAVIRT_XXL
-DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text);
-DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
+DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text);
+DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -85,28 +65,6 @@ static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
tlb_remove_page(tlb, table);
}
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
- unsigned int len)
-{
- /*
- * Neat trick to map patch type back to the call within the
- * corresponding structure.
- */
- void *opfunc = *((void **)&pv_ops + type);
- unsigned ret;
-
- if (opfunc == NULL)
- /* If there's no function, patch it with paravirt_BUG() */
- ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len);
- else if (opfunc == _paravirt_nop)
- ret = 0;
- else
- /* Otherwise call the function. */
- ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
-
- return ret;
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -143,66 +101,7 @@ int paravirt_disable_iospace(void)
return request_resource(&ioport_resource, &reserve_ioports);
}
-static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
-
-static inline void enter_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-
- this_cpu_write(paravirt_lazy_mode, mode);
-}
-
-static void leave_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
-
- this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
-}
-
-void paravirt_enter_lazy_mmu(void)
-{
- enter_lazy(PARAVIRT_LAZY_MMU);
-}
-
-void paravirt_leave_lazy_mmu(void)
-{
- leave_lazy(PARAVIRT_LAZY_MMU);
-}
-
-void paravirt_flush_lazy_mmu(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- arch_enter_lazy_mmu_mode();
- }
-
- preempt_enable();
-}
-
#ifdef CONFIG_PARAVIRT_XXL
-void paravirt_start_context_switch(struct task_struct *prev)
-{
- BUG_ON(preemptible());
-
- if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
- }
- enter_lazy(PARAVIRT_LAZY_CPU);
-}
-
-void paravirt_end_context_switch(struct task_struct *next)
-{
- BUG_ON(preemptible());
-
- leave_lazy(PARAVIRT_LAZY_CPU);
-
- if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
- arch_enter_lazy_mmu_mode();
-}
-
static noinstr void pv_native_write_cr2(unsigned long val)
{
native_write_cr2(val);
@@ -229,14 +128,6 @@ static noinstr void pv_native_safe_halt(void)
}
#endif
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
-{
- if (in_interrupt())
- return PARAVIRT_LAZY_NONE;
-
- return this_cpu_read(paravirt_lazy_mode);
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
#ifdef CONFIG_PARAVIRT_XXL
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f0909142a0a..ab49ade31b0d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -257,13 +257,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);
- /*
- * If copy_thread() if failing, don't leak the shadow stack possibly
- * allocated in shstk_alloc_thread_stack() above.
- */
- if (ret)
- shstk_free(p);
-
return ret;
}
@@ -484,7 +477,7 @@ void native_tss_update_io_bitmap(void)
/*
* Make sure that the TSS limit is covering the IO bitmap. It might have
* been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
- * access from user space to trigger a #GP because tbe bitmap is outside
+ * access from user space to trigger a #GP because the bitmap is outside
* the TSS limit.
*/
refresh_tss_limit();
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 3adbe97015c1..830425e6d38e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -22,7 +22,6 @@
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
#include <asm/pci_x86.h>
-#include <asm/virtext.h>
#include <asm/cpu.h>
#include <asm/nmi.h>
#include <asm/smp.h>
@@ -530,9 +529,54 @@ static inline void kb_wait(void)
static inline void nmi_shootdown_cpus_on_restart(void);
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
+/* RCU-protected callback to disable virtualization prior to reboot. */
+static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
+
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, callback);
+}
+EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
+
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_virt_cb *callback;
+
+ /*
+ * IRQs must be disabled as KVM enables virtualization in hardware via
+ * function call IPIs, i.e. IRQs need to be disabled to guarantee
+ * virtualization stays disabled.
+ */
+ lockdep_assert_irqs_disabled();
+
+ rcu_read_lock();
+ callback = rcu_dereference(cpu_emergency_virt_callback);
+ if (callback)
+ callback();
+ rcu_read_unlock();
+}
+
static void emergency_reboot_disable_virtualization(void)
{
- /* Just make sure we won't change CPUs while doing this */
local_irq_disable();
/*
@@ -545,7 +589,7 @@ static void emergency_reboot_disable_virtualization(void)
* Do the NMI shootdown even if virtualization is off on _this_ CPU, as
* other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx() || cpu_has_svm(NULL)) {
+ if (rcu_access_pointer(cpu_emergency_virt_callback)) {
/* Safely force _this_ CPU out of VMX/SVM operation. */
cpu_emergency_disable_virtualization();
@@ -553,7 +597,9 @@ static void emergency_reboot_disable_virtualization(void)
nmi_shootdown_cpus_on_restart();
}
}
-
+#else
+static void emergency_reboot_disable_virtualization(void) { }
+#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
void __attribute__((weak)) mach_reboot_fixups(void)
{
@@ -787,21 +833,9 @@ void machine_crash_shutdown(struct pt_regs *regs)
}
#endif
-
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
-/*
- * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
- * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
- * GIF=0, i.e. if the crash occurred between CLGI and STGI.
- */
-void cpu_emergency_disable_virtualization(void)
-{
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-}
-
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1309b9b05338..2e7066980f3e 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -67,7 +67,7 @@ void mach_get_cmos_time(struct timespec64 *now)
return;
}
- if (mc146818_get_time(&tm)) {
+ if (mc146818_get_time(&tm, 1000)) {
pr_err("Unable to read current time from RTC\n");
now->tv_sec = now->tv_nsec = 0;
return;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b9145a63da77..84201071dfac 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -226,8 +226,6 @@ static void __init reserve_brk(void)
_brk_start = 0;
}
-u64 relocated_ramdisk;
-
#ifdef CONFIG_BLK_DEV_INITRD
static u64 __init get_ramdisk_image(void)
@@ -261,7 +259,7 @@ static void __init relocate_initrd(void)
u64 area_size = PAGE_ALIGN(ramdisk_size);
/* We need to move the initrd down into directly mapped mem */
- relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
+ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
PFN_PHYS(max_pfn_mapped));
if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
@@ -358,15 +356,11 @@ static void __init add_early_ima_buffer(u64 phys_addr)
#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
int __init ima_free_kexec_buffer(void)
{
- int rc;
-
if (!ima_kexec_buffer_size)
return -ENOENT;
- rc = memblock_phys_free(ima_kexec_buffer_phys,
- ima_kexec_buffer_size);
- if (rc)
- return rc;
+ memblock_free_late(ima_kexec_buffer_phys,
+ ima_kexec_buffer_size);
ima_kexec_buffer_phys = 0;
ima_kexec_buffer_size = 0;
@@ -470,154 +464,29 @@ static void __init memblock_x86_reserve_range_setup_data(void)
}
}
-/*
- * --------- Crashkernel reservation ------------------------------
- */
-
-/* 16M alignment for crash kernel regions */
-#define CRASH_ALIGN SZ_16M
-
-/*
- * Keep the crash kernel below this limit.
- *
- * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
- * due to mapping restrictions.
- *
- * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
- * the upper limit of system RAM in 4-level paging mode. Since the kdump
- * jump could be from 5-level paging to 4-level paging, the jump will fail if
- * the kernel is put above 64 TB, and during the 1st kernel bootup there's
- * no good way to detect the paging mode of the target kernel which will be
- * loaded for dumping.
- */
-#ifdef CONFIG_X86_32
-# define CRASH_ADDR_LOW_MAX SZ_512M
-# define CRASH_ADDR_HIGH_MAX SZ_512M
-#else
-# define CRASH_ADDR_LOW_MAX SZ_4G
-# define CRASH_ADDR_HIGH_MAX SZ_64T
-#endif
-
-static int __init reserve_crashkernel_low(void)
+static void __init arch_reserve_crashkernel(void)
{
-#ifdef CONFIG_X86_64
- unsigned long long base, low_base = 0, low_size = 0;
- unsigned long low_mem_limit;
- int ret;
-
- low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
-
- /* crashkernel=Y,low */
- ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
- if (ret) {
- /*
- * two parts from kernel/dma/swiotlb.c:
- * -swiotlb size: user-specified with swiotlb= or default.
- *
- * -swiotlb overflow buffer: now hardcoded to 32k. We round it
- * to 8M for other buffers that may need to stay low too. Also
- * make sure we allocate enough extra low memory so that we
- * don't run out of DMA buffers for 32-bit devices.
- */
- low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
- } else {
- /* passed with crashkernel=0,low ? */
- if (!low_size)
- return 0;
- }
-
- low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
- if (!low_base) {
- pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
- (unsigned long)(low_size >> 20));
- return -ENOMEM;
- }
-
- pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
- (unsigned long)(low_size >> 20),
- (unsigned long)(low_base >> 20),
- (unsigned long)(low_mem_limit >> 20));
-
- crashk_low_res.start = low_base;
- crashk_low_res.end = low_base + low_size - 1;
- insert_resource(&iomem_resource, &crashk_low_res);
-#endif
- return 0;
-}
-
-static void __init reserve_crashkernel(void)
-{
- unsigned long long crash_size, crash_base, total_mem;
+ unsigned long long crash_base, crash_size, low_size = 0;
+ char *cmdline = boot_command_line;
bool high = false;
int ret;
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
- total_mem = memblock_phys_mem_size();
-
- /* crashkernel=XM */
- ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0) {
- /* crashkernel=X,high */
- ret = parse_crashkernel_high(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0)
- return;
- high = true;
- }
+ ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+ &crash_size, &crash_base,
+ &low_size, &high);
+ if (ret)
+ return;
if (xen_pv_domain()) {
pr_info("Ignoring crashkernel for a Xen PV domain\n");
return;
}
- /* 0 means: find the address automatically */
- if (!crash_base) {
- /*
- * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
- * crashkernel=x,high reserves memory over 4G, also allocates
- * 256M extra low memory for DMA buffers and swiotlb.
- * But the extra memory is not required for all machines.
- * So try low memory first and fall back to high memory
- * unless "crashkernel=size[KMG],high" is specified.
- */
- if (!high)
- crash_base = memblock_phys_alloc_range(crash_size,
- CRASH_ALIGN, CRASH_ALIGN,
- CRASH_ADDR_LOW_MAX);
- if (!crash_base)
- crash_base = memblock_phys_alloc_range(crash_size,
- CRASH_ALIGN, CRASH_ALIGN,
- CRASH_ADDR_HIGH_MAX);
- if (!crash_base) {
- pr_info("crashkernel reservation failed - No suitable area found.\n");
- return;
- }
- } else {
- unsigned long long start;
-
- start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
- crash_base + crash_size);
- if (start != crash_base) {
- pr_info("crashkernel reservation failed - memory is in use.\n");
- return;
- }
- }
-
- if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
- memblock_phys_free(crash_base, crash_size);
- return;
- }
-
- pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
+ reserve_crashkernel_generic(cmdline, crash_size, crash_base,
+ low_size, high);
}
static struct resource standard_io_resources[] = {
@@ -1124,7 +993,7 @@ void __init setup_arch(char **cmdline_p)
* Needs to run after memblock setup because it needs the physical
* memory size.
*/
- sev_setup_arch();
+ mem_encrypt_setup_arch();
efi_fake_memmap();
efi_find_mirror();
@@ -1162,6 +1031,8 @@ void __init setup_arch(char **cmdline_p)
*
* Moreover, on machines with SandyBridge graphics or in setups that use
* crashkernel the entire 1M is reserved anyway.
+ *
+ * Note the host kernel TDX also requires the first 1MB being reserved.
*/
x86_platform.realmode_reserve();
@@ -1221,6 +1092,8 @@ void __init setup_arch(char **cmdline_p)
early_acpi_boot_init();
+ x86_flattree_get_config();
+
initmem_init();
dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
@@ -1231,7 +1104,7 @@ void __init setup_arch(char **cmdline_p)
* Reserve memory for crash kernel after SRAT is parsed so that it
* won't consume hotpluggable memory.
*/
- reserve_crashkernel();
+ arch_reserve_crashkernel();
memblock_find_dma_reserve();
@@ -1294,7 +1167,7 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
+ vgacon_register_screen(&screen_info);
#endif
#endif
x86_init.oem.banner();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2c97bf7b56ae..b30d6e180df7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -106,8 +106,8 @@ void __init pcpu_populate_pte(unsigned long addr)
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
- struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
- 0xFFFFF);
+ struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32,
+ per_cpu_offset(cpu), 0xFFFFF);
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 2eabccde94fb..1d24ec679915 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -96,7 +96,7 @@ static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
/* Tell the hypervisor what went wrong. */
val |= GHCB_SEV_TERM_REASON(set, reason);
- /* Request Guest Termination from Hypvervisor */
+ /* Request Guest Termination from Hypervisor */
sev_es_wr_ghcb_msr(val);
VMGEXIT();
@@ -256,7 +256,7 @@ static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
return 0;
}
-static int sev_cpuid_hv(struct cpuid_leaf *leaf)
+static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf)
{
int ret;
@@ -279,6 +279,45 @@ static int sev_cpuid_hv(struct cpuid_leaf *leaf)
return ret;
}
+static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ u32 cr4 = native_read_cr4();
+ int ret;
+
+ ghcb_set_rax(ghcb, leaf->fn);
+ ghcb_set_rcx(ghcb, leaf->subfn);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #UD - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ leaf->eax = ghcb->save.rax;
+ leaf->ebx = ghcb->save.rbx;
+ leaf->ecx = ghcb->save.rcx;
+ leaf->edx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf)
+ : __sev_cpuid_hv_msr(leaf);
+}
+
/*
* This may be called early while still running on the initial identity
* mapping. Use RIP-relative addressing to obtain the correct address
@@ -388,19 +427,20 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
return false;
}
-static void snp_cpuid_hv(struct cpuid_leaf *leaf)
+static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
- if (sev_cpuid_hv(leaf))
+ if (sev_cpuid_hv(ghcb, ctxt, leaf))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
}
-static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
+static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ struct cpuid_leaf *leaf)
{
struct cpuid_leaf leaf_hv = *leaf;
switch (leaf->fn) {
case 0x1:
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* initial APIC ID */
leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
@@ -419,7 +459,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
break;
case 0xB:
leaf_hv.subfn = 0;
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* extended APIC ID */
leaf->edx = leaf_hv.edx;
@@ -467,7 +507,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
}
break;
case 0x8000001E:
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* extended APIC ID */
leaf->eax = leaf_hv.eax;
@@ -488,7 +528,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
* Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
* should be treated as fatal by caller.
*/
-static int snp_cpuid(struct cpuid_leaf *leaf)
+static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -522,7 +562,7 @@ static int snp_cpuid(struct cpuid_leaf *leaf)
return 0;
}
- return snp_cpuid_postprocess(leaf);
+ return snp_cpuid_postprocess(ghcb, ctxt, leaf);
}
/*
@@ -544,14 +584,14 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
leaf.fn = fn;
leaf.subfn = subfn;
- ret = snp_cpuid(&leaf);
+ ret = snp_cpuid(NULL, NULL, &leaf);
if (!ret)
goto cpuid_done;
if (ret != -EOPNOTSUPP)
goto fail;
- if (sev_cpuid_hv(&leaf))
+ if (__sev_cpuid_hv_msr(&leaf))
goto fail;
cpuid_done:
@@ -592,6 +632,23 @@ fail:
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
}
+static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt,
+ unsigned long address,
+ bool write)
+{
+ if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_USER;
+ ctxt->fi.cr2 = address;
+ if (write)
+ ctxt->fi.error_code |= X86_PF_WRITE;
+
+ return ES_EXCEPTION;
+ }
+
+ return ES_OK;
+}
+
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
void *src, char *buf,
unsigned int data_size,
@@ -599,7 +656,12 @@ static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
bool backwards)
{
int i, b = backwards ? -1 : 1;
- enum es_result ret = ES_OK;
+ unsigned long address = (unsigned long)src;
+ enum es_result ret;
+
+ ret = vc_insn_string_check(ctxt, address, false);
+ if (ret != ES_OK)
+ return ret;
for (i = 0; i < count; i++) {
void *s = src + (i * data_size * b);
@@ -620,7 +682,12 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
bool backwards)
{
int i, s = backwards ? -1 : 1;
- enum es_result ret = ES_OK;
+ unsigned long address = (unsigned long)dst;
+ enum es_result ret;
+
+ ret = vc_insn_string_check(ctxt, address, true);
+ if (ret != ES_OK)
+ return ret;
for (i = 0; i < count; i++) {
void *d = dst + (i * data_size * s);
@@ -656,6 +723,9 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
{
struct insn *insn = &ctxt->insn;
+ size_t size;
+ u64 port;
+
*exitinfo = 0;
switch (insn->opcode.bytes[0]) {
@@ -664,7 +734,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
case 0x6d:
*exitinfo |= IOIO_TYPE_INS;
*exitinfo |= IOIO_SEG_ES;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ port = ctxt->regs->dx & 0xffff;
break;
/* OUTS opcodes */
@@ -672,41 +742,43 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
case 0x6f:
*exitinfo |= IOIO_TYPE_OUTS;
*exitinfo |= IOIO_SEG_DS;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ port = ctxt->regs->dx & 0xffff;
break;
/* IN immediate opcodes */
case 0xe4:
case 0xe5:
*exitinfo |= IOIO_TYPE_IN;
- *exitinfo |= (u8)insn->immediate.value << 16;
+ port = (u8)insn->immediate.value & 0xffff;
break;
/* OUT immediate opcodes */
case 0xe6:
case 0xe7:
*exitinfo |= IOIO_TYPE_OUT;
- *exitinfo |= (u8)insn->immediate.value << 16;
+ port = (u8)insn->immediate.value & 0xffff;
break;
/* IN register opcodes */
case 0xec:
case 0xed:
*exitinfo |= IOIO_TYPE_IN;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ port = ctxt->regs->dx & 0xffff;
break;
/* OUT register opcodes */
case 0xee:
case 0xef:
*exitinfo |= IOIO_TYPE_OUT;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ port = ctxt->regs->dx & 0xffff;
break;
default:
return ES_DECODE_FAILED;
}
+ *exitinfo |= port << 16;
+
switch (insn->opcode.bytes[0]) {
case 0x6c:
case 0x6e:
@@ -716,12 +788,15 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
case 0xee:
/* Single byte opcodes */
*exitinfo |= IOIO_DATA_8;
+ size = 1;
break;
default:
/* Length determined by instruction parsing */
*exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
: IOIO_DATA_32;
+ size = (insn->opnd_bytes == 2) ? 2 : 4;
}
+
switch (insn->addr_bytes) {
case 2:
*exitinfo |= IOIO_ADDR_16;
@@ -737,7 +812,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
if (insn_has_rep_prefix(insn))
*exitinfo |= IOIO_REP;
- return ES_OK;
+ return vc_ioio_check(ctxt, (u16)port, size);
}
static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
@@ -848,14 +923,15 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ret;
}
-static int vc_handle_cpuid_snp(struct pt_regs *regs)
+static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
+ struct pt_regs *regs = ctxt->regs;
struct cpuid_leaf leaf;
int ret;
leaf.fn = regs->ax;
leaf.subfn = regs->cx;
- ret = snp_cpuid(&leaf);
+ ret = snp_cpuid(ghcb, ctxt, &leaf);
if (!ret) {
regs->ax = leaf.eax;
regs->bx = leaf.ebx;
@@ -874,7 +950,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
enum es_result ret;
int snp_cpuid_ret;
- snp_cpuid_ret = vc_handle_cpuid_snp(regs);
+ snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt);
if (!snp_cpuid_ret)
return ES_OK;
if (snp_cpuid_ret != -EOPNOTSUPP)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 2787826d9f60..c67285824e82 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -524,6 +524,33 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
return ES_OK;
}
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+ BUG_ON(size > 4);
+
+ if (user_mode(ctxt->regs)) {
+ struct thread_struct *t = &current->thread;
+ struct io_bitmap *iobm = t->io_bitmap;
+ size_t idx;
+
+ if (!iobm)
+ goto fault;
+
+ for (idx = port; idx < port + size; ++idx) {
+ if (test_bit(idx, iobm->bitmap))
+ goto fault;
+ }
+ }
+
+ return ES_OK;
+
+fault:
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+
+ return ES_EXCEPTION;
+}
+
/* Include code shared with pre-decompression boot stage */
#include "sev-shared.c"
@@ -868,8 +895,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
void snp_accept_memory(phys_addr_t start, phys_addr_t end)
{
- unsigned long vaddr;
- unsigned int npages;
+ unsigned long vaddr, npages;
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return;
@@ -940,7 +966,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
free_page((unsigned long)vmsa);
}
-static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
+static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
{
struct sev_es_save_area *cur_vmsa, *vmsa;
struct ghcb_state state;
@@ -1208,10 +1234,6 @@ void setup_ghcb(void)
if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return;
- /* First make sure the hypervisor talks a supported protocol. */
- if (!sev_es_negotiate_protocol())
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-
/*
* Check whether the runtime #VC exception handler is active. It uses
* the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
@@ -1229,6 +1251,13 @@ void setup_ghcb(void)
}
/*
+ * Make sure the hypervisor talks a supported protocol.
+ * This gets called only in the BSP boot phase.
+ */
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /*
* Clear the boot_ghcb. The first exception comes in before the bss
* section is cleared.
*/
@@ -1509,6 +1538,9 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ES_DECODE_FAILED;
}
+ if (user_mode(ctxt->regs))
+ return ES_UNSUPPORTED;
+
switch (mmio) {
case INSN_MMIO_WRITE:
memcpy(ghcb->shared_buffer, reg_data, bytes);
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index fd689921a1db..59e15dd8d0f8 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -205,10 +205,21 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl
return 0;
/*
- * For CLONE_VM, except vfork, the child needs a separate shadow
+ * For CLONE_VFORK the child will share the parents shadow stack.
+ * Make sure to clear the internal tracking of the thread shadow
+ * stack so the freeing logic run for child knows to leave it alone.
+ */
+ if (clone_flags & CLONE_VFORK) {
+ shstk->base = 0;
+ shstk->size = 0;
+ return 0;
+ }
+
+ /*
+ * For !CLONE_VM the child will use a copy of the parents shadow
* stack.
*/
- if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+ if (!(clone_flags & CLONE_VM))
return 0;
size = adjust_shstk_size(stack_size);
@@ -408,7 +419,25 @@ void shstk_free(struct task_struct *tsk)
if (!tsk->mm || tsk->mm != current->mm)
return;
+ /*
+ * If shstk->base is NULL, then this task is not managing its
+ * own shadow stack (CLONE_VFORK). So skip freeing it.
+ */
+ if (!shstk->base)
+ return;
+
+ /*
+ * shstk->base is NULL for CLONE_VFORK child tasks, and so is
+ * normal. But size = 0 on a shstk->base is not normal and
+ * indicated an attempt to free the thread shadow stack twice.
+ * Warn about it.
+ */
+ if (WARN_ON(!shstk->size))
+ return;
+
unmap_shadow_stack(shstk->base, shstk->size);
+
+ shstk->size = 0;
}
static int wrss_control(bool enable)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 65fe2094da59..31b6f5dddfc2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/syscalls.h>
+#include <linux/rseq.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index cacf2ede6217..23d8aaf8d9fd 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -175,9 +175,6 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
uc_flags = frame_uc_flags(regs);
- if (setup_signal_shadow_stack(ksig))
- return -EFAULT;
-
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -198,6 +195,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
return -EFAULT;
}
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
/* Set up registers for signal handler */
regs->di = ksig->sig;
/* In case the signal handler was declared without prototypes */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 1bb79526c217..2908e063d7d8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -131,7 +131,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
}
/*
- * Disable virtualization, APIC etc. and park the CPU in a HLT loop
+ * this function calls the 'stop' function on all other CPUs in the system.
*/
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
@@ -174,17 +174,13 @@ static void native_stop_other_cpus(int wait)
* 2) Wait for all other CPUs to report that they reached the
* HLT loop in stop_this_cpu()
*
- * 3) If the system uses INIT/STARTUP for CPU bringup, then
- * send all present CPUs an INIT vector, which brings them
- * completely out of the way.
+ * 3) If #2 timed out send an NMI to the CPUs which did not
+ * yet report
*
- * 4) If #3 is not possible and #2 timed out send an NMI to the
- * CPUs which did not yet report
- *
- * 5) Wait for all other CPUs to report that they reached the
+ * 4) Wait for all other CPUs to report that they reached the
* HLT loop in stop_this_cpu()
*
- * #4 can obviously race against a CPU reaching the HLT loop late.
+ * #3 can obviously race against a CPU reaching the HLT loop late.
* That CPU will have reported already and the "have all CPUs
* reached HLT" condition will be true despite the fact that the
* other CPU is still handling the NMI. Again, there is no
@@ -200,7 +196,7 @@ static void native_stop_other_cpus(int wait)
/*
* Don't wait longer than a second for IPI completion. The
* wait request is not checked here because that would
- * prevent an NMI/INIT shutdown in case that not all
+ * prevent an NMI shutdown attempt in case that not all
* CPUs reach shutdown state.
*/
timeout = USEC_PER_SEC;
@@ -208,27 +204,7 @@ static void native_stop_other_cpus(int wait)
udelay(1);
}
- /*
- * Park all other CPUs in INIT including "offline" CPUs, if
- * possible. That's a safe place where they can't resume execution
- * of HLT and then execute the HLT loop from overwritten text or
- * page tables.
- *
- * The only downside is a broadcast MCE, but up to the point where
- * the kexec() kernel brought all APs online again an MCE will just
- * make HLT resume and handle the MCE. The machine crashes and burns
- * due to overwritten text, page tables and data. So there is a
- * choice between fire and frying pan. The result is pretty much
- * the same. Chose frying pan until x86 provides a sane mechanism
- * to park a CPU.
- */
- if (smp_park_other_cpus_in_init())
- goto done;
-
- /*
- * If park with INIT was not possible and the REBOOT_VECTOR didn't
- * take all secondary CPUs offline, try with the NMI.
- */
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
if (!cpumask_empty(&cpus_stop_mask)) {
/*
* If NMI IPI is enabled, try to register the stop handler
@@ -253,7 +229,6 @@ static void native_stop_other_cpus(int wait)
udelay(1);
}
-done:
local_irq_save(flags);
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d7667a29acf3..3f57ce68a3f1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -87,6 +87,7 @@
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
#include <asm/sev.h>
+#include <asm/spec-ctrl.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -124,7 +125,20 @@ struct mwait_cpu_dead {
*/
static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
-/* Logical package management. We might want to allocate that dynamically */
+/* Logical package management. */
+struct logical_maps {
+ u32 phys_pkg_id;
+ u32 phys_die_id;
+ u32 logical_pkg_id;
+ u32 logical_die_id;
+};
+
+/* Temporary workaround until the full topology mechanics is in place */
+static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = {
+ .phys_pkg_id = U32_MAX,
+ .phys_die_id = U32_MAX,
+};
+
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
@@ -258,12 +272,9 @@ static void notrace start_secondary(void *unused)
cpu_init_exception_handling();
/*
- * 32-bit systems load the microcode from the ASM startup code for
- * historical reasons.
- *
- * On 64-bit systems load it before reaching the AP alive
- * synchronization point below so it is not part of the full per
- * CPU serialized bringup part when "parallel" bringup is enabled.
+ * Load the microcode before reaching the AP alive synchronization
+ * point below so it is not part of the full per CPU serialized
+ * bringup part when "parallel" bringup is enabled.
*
* That's even safe when hyperthreading is enabled in the CPU as
* the core code starts the primary threads first and leaves the
@@ -276,8 +287,7 @@ static void notrace start_secondary(void *unused)
* CPUID, MSRs etc. must be strictly serialized to maintain
* software state correctness.
*/
- if (IS_ENABLED(CONFIG_X86_64))
- load_ucode_ap();
+ load_ucode_ap();
/*
* Synchronization point with the hotplug core. Sets this CPUs
@@ -288,7 +298,7 @@ static void notrace start_secondary(void *unused)
cpu_init();
fpu__init_cpu();
- rcu_cpu_starting(raw_smp_processor_id());
+ rcutree_report_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
ap_starting();
@@ -337,10 +347,8 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg)
int cpu;
for_each_possible_cpu(cpu) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->initialized && c->phys_proc_id == phys_pkg)
- return c->logical_proc_id;
+ if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg)
+ return per_cpu(logical_maps.logical_pkg_id, cpu);
}
return -1;
}
@@ -355,14 +363,12 @@ EXPORT_SYMBOL(topology_phys_to_logical_pkg);
*/
static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
{
- int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id;
+ int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id;
for_each_possible_cpu(cpu) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->initialized && c->cpu_die_id == die_id &&
- c->phys_proc_id == proc_id)
- return c->logical_die_id;
+ if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id &&
+ per_cpu(logical_maps.phys_die_id, cpu) == die_id)
+ return per_cpu(logical_maps.logical_die_id, cpu);
}
return -1;
}
@@ -387,7 +393,9 @@ int topology_update_package_map(unsigned int pkg, unsigned int cpu)
cpu, pkg, new);
}
found:
- cpu_data(cpu).logical_proc_id = new;
+ per_cpu(logical_maps.phys_pkg_id, cpu) = pkg;
+ per_cpu(logical_maps.logical_pkg_id, cpu) = new;
+ cpu_data(cpu).topo.logical_pkg_id = new;
return 0;
}
/**
@@ -410,7 +418,9 @@ int topology_update_die_map(unsigned int die, unsigned int cpu)
cpu, die, new);
}
found:
- cpu_data(cpu).logical_die_id = new;
+ per_cpu(logical_maps.phys_die_id, cpu) = die;
+ per_cpu(logical_maps.logical_die_id, cpu) = new;
+ cpu_data(cpu).topo.logical_die_id = new;
return 0;
}
@@ -421,8 +431,8 @@ static void __init smp_store_boot_cpu_info(void)
*c = boot_cpu_data;
c->cpu_index = id;
- topology_update_package_map(c->phys_proc_id, id);
- topology_update_die_map(c->cpu_die_id, id);
+ topology_update_package_map(c->topo.pkg_id, id);
+ topology_update_die_map(c->topo.die_id, id);
c->initialized = true;
}
@@ -476,21 +486,21 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_die_id == o->cpu_die_id &&
- per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
- if (c->cpu_core_id == o->cpu_core_id)
+ if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id &&
+ per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
+ if (c->topo.core_id == o->topo.core_id)
return topology_sane(c, o, "smt");
- if ((c->cu_id != 0xff) &&
- (o->cu_id != 0xff) &&
- (c->cu_id == o->cu_id))
+ if ((c->topo.cu_id != 0xff) &&
+ (o->topo.cu_id != 0xff) &&
+ (c->topo.cu_id == o->topo.cu_id))
return topology_sane(c, o, "smt");
}
- } else if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_die_id == o->cpu_die_id &&
- c->cpu_core_id == o->cpu_core_id) {
+ } else if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id &&
+ c->topo.core_id == o->topo.core_id) {
return topology_sane(c, o, "smt");
}
@@ -499,8 +509,8 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_die_id == o->cpu_die_id)
+ if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id)
return true;
return false;
}
@@ -510,11 +520,11 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
/* If the arch didn't set up l2c_id, fall back to SMT */
- if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+ if (per_cpu_l2c_id(cpu1) == BAD_APICID)
return match_smt(c, o);
/* Do not match if L2 cache id does not match: */
- if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+ if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2))
return false;
return topology_sane(c, o, "l2c");
@@ -527,7 +537,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
*/
static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (c->phys_proc_id == o->phys_proc_id)
+ if (c->topo.pkg_id == o->topo.pkg_id)
return true;
return false;
}
@@ -560,11 +570,11 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
bool intel_snc = id && id->driver_data;
/* Do not match if we do not have a valid APICID for cpu: */
- if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
+ if (per_cpu_llc_id(cpu1) == BAD_APICID)
return false;
/* Do not match if LLC id does not match: */
- if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
+ if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
return false;
/*
@@ -579,7 +589,6 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -603,7 +612,14 @@ static int x86_cluster_flags(void)
return cpu_cluster_flags() | x86_sched_itmt_flags();
}
#endif
-#endif
+
+static int x86_die_flags(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return x86_sched_itmt_flags();
+
+ return 0;
+}
/*
* Set if a package/die has multiple NUMA nodes inside.
@@ -634,13 +650,13 @@ static void __init build_sched_topology(void)
};
#endif
/*
- * When there is NUMA topology inside the package skip the DIE domain
+ * When there is NUMA topology inside the package skip the PKG domain
* since the NUMA domains will auto-magically create the right spanning
* domains based on the SLIT.
*/
if (!x86_has_numa_in_package) {
x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, SD_INIT_NAME(DIE)
+ cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
};
}
@@ -741,6 +757,7 @@ const struct cpumask *cpu_clustergroup_mask(int cpu)
{
return cpu_l2c_shared_mask(cpu);
}
+EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);
static void impress_friends(void)
{
@@ -803,7 +820,7 @@ static void __init smp_quirk_init_udelay(void)
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
-static void send_init_sequence(int phys_apicid)
+static void send_init_sequence(u32 phys_apicid)
{
int maxlvt = lapic_get_maxlvt();
@@ -829,7 +846,7 @@ static void send_init_sequence(int phys_apicid)
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
-static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip)
{
unsigned long send_status = 0, accept_status = 0;
int num_starts, j, maxlvt;
@@ -976,7 +993,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
* Returns zero if startup was successfully sent, else error code from
* ->wakeup_secondary_cpu.
*/
-static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
{
unsigned long start_ip = real_mode_header->trampoline_start;
int ret;
@@ -1044,7 +1061,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
{
- int apicid = apic->cpu_present_to_apicid(cpu);
+ u32 apicid = apic->cpu_present_to_apicid(cpu);
int err;
lockdep_assert_irqs_enabled();
@@ -1234,33 +1251,6 @@ void arch_thaw_secondary_cpus_end(void)
cache_aps_init();
}
-bool smp_park_other_cpus_in_init(void)
-{
- unsigned int cpu, this_cpu = smp_processor_id();
- unsigned int apicid;
-
- if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu)
- return false;
-
- /*
- * If this is a crash stop which does not execute on the boot CPU,
- * then this cannot use the INIT mechanism because INIT to the boot
- * CPU will reset the machine.
- */
- if (this_cpu)
- return false;
-
- for_each_present_cpu(cpu) {
- if (cpu == this_cpu)
- continue;
- apicid = apic->cpu_present_to_apicid(cpu);
- if (apicid == BAD_APICID)
- continue;
- send_init_sequence(apicid);
- }
- return true;
-}
-
/*
* Early setup to make printk work.
*/
@@ -1426,7 +1416,7 @@ static void remove_siblinginfo(int cpu)
cpumask_clear(topology_sibling_cpumask(cpu));
cpumask_clear(topology_core_cpumask(cpu));
cpumask_clear(topology_die_cpumask(cpu));
- c->cpu_core_id = 0;
+ c->topo.core_id = 0;
c->booted_cores = 0;
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
recompute_smt_state();
@@ -1617,8 +1607,15 @@ void __noreturn hlt_play_dead(void)
native_halt();
}
+/*
+ * native_play_dead() is essentially a __noreturn function, but it can't
+ * be marked as such as the compiler may complain about it.
+ */
void native_play_dead(void)
{
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ __update_spec_ctrl(0);
+
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index ca004e2e4469..d42c28b8bfd8 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -35,38 +35,9 @@
#include <asm/io_apic.h>
#include <asm/cpu.h>
-static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
-
#ifdef CONFIG_HOTPLUG_CPU
-int arch_register_cpu(int cpu)
+bool arch_cpu_is_hotpluggable(int cpu)
{
- struct x86_cpu *xc = per_cpu_ptr(&cpu_devices, cpu);
-
- xc->cpu.hotpluggable = cpu > 0;
- return register_cpu(&xc->cpu, cpu);
-}
-EXPORT_SYMBOL(arch_register_cpu);
-
-void arch_unregister_cpu(int num)
-{
- unregister_cpu(&per_cpu(cpu_devices, num).cpu);
-}
-EXPORT_SYMBOL(arch_unregister_cpu);
-#else /* CONFIG_HOTPLUG_CPU */
-
-static int __init arch_register_cpu(int num)
-{
- return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+ return cpu > 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
-
-static int __init topology_init(void)
-{
- int i;
-
- for_each_present_cpu(i)
- arch_register_cpu(i);
-
- return 0;
-}
-subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 78b1d1a6ed2c..4b256de7c58a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,6 +37,7 @@
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
@@ -565,7 +566,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs)
*/
static bool try_fixup_enqcmd_gp(void)
{
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
u32 pasid;
/*
@@ -591,7 +592,7 @@ static bool try_fixup_enqcmd_gp(void)
if (!mm_valid_pasid(current->mm))
return false;
- pasid = current->mm->pasid;
+ pasid = mm_get_enqcmd_pasid(current->mm);
/*
* Did this thread already have its PASID activated?
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bbc440c93e08..1123ef3ccf90 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -15,6 +15,7 @@
* ( The serial nature of the boot logic and the CPU hotplug lock
* protects against more than 2 CPUs entering this code. )
*/
+#include <linux/workqueue.h>
#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
@@ -342,6 +343,13 @@ static inline unsigned int loop_timeout(int cpu)
return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
}
+static void tsc_sync_mark_tsc_unstable(struct work_struct *work)
+{
+ mark_tsc_unstable("check_tsc_sync_source failed");
+}
+
+static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable);
+
/*
* The freshly booted CPU initiates this via an async SMP function call.
*/
@@ -395,7 +403,7 @@ retry:
"turning off TSC clock.\n", max_warp);
if (random_warps)
pr_warn("TSC warped randomly between CPUs\n");
- mark_tsc_unstable("check_tsc_sync_source failed");
+ schedule_work(&tsc_sync_work);
}
/*
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 7e574cf3bf8a..d00c28aaa5be 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -85,7 +85,7 @@ static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
{
int *first = ip_table;
int *last = ip_table + num_entries - 1;
- int *mid = first, *found = first;
+ int *mid, *found = first;
if (!num_entries)
return NULL;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e701f2dcea29..9be175c8ac97 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -140,10 +140,7 @@ SECTIONS
STATIC_CALL_TEXT
ALIGN_ENTRY_TEXT_BEGIN
-#ifdef CONFIG_CPU_SRSO
*(.text..__x86.rethunk_untrain)
-#endif
-
ENTRY_TEXT
#ifdef CONFIG_CPU_SRSO
@@ -157,7 +154,7 @@ SECTIONS
ALIGN_ENTRY_TEXT_END
*(.gnu.warning)
- } :text =0xcccc
+ } :text = 0xcccccccc
/* End of text section, which should occupy whole number of pages */
_etext = .;
@@ -271,19 +268,6 @@ SECTIONS
}
#endif
- /*
- * start address and size of operations which during runtime
- * can be patched with virtualization friendly instructions or
- * baremetal native ones. Think page table operations.
- * Details in paravirt_types.h
- */
- . = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
-
#ifdef CONFIG_RETPOLINE
/*
* List of instructions that call/jmp/jcc to retpoline thunks
@@ -521,12 +505,12 @@ INIT_PER_CPU(irq_stack_backing_store);
"fixed_percpu_data is not at start of per-cpu area");
#endif
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_CPU_UNRET_ENTRY
. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
-. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
#endif
#ifdef CONFIG_CPU_SRSO
+. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
/*
* GNU ld cannot do XOR until 2.41.
* https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 65e96b76c423..d3fc01770558 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -127,7 +127,7 @@ static void __init vsmp_cap_cpus(void)
#endif
}
-static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb)
{
return read_apic_id() >> index_msb;
}