diff options
Diffstat (limited to 'arch/x86/kernel')
84 files changed, 2388 insertions, 1823 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4070a01c11b7..3269a0e23d3a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -33,11 +33,10 @@ KCSAN_SANITIZE := n KMSAN_SANITIZE_head$(BITS).o := n KMSAN_SANITIZE_nmi.o := n -# If instrumentation of this dir is enabled, boot hangs during first second. -# Probably could be more selective here, but note that files related to irqs, -# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to -# non-deterministic coverage. -KCOV_INSTRUMENT := n +# If instrumentation of the following files is enabled, boot hangs during +# first second. +KCOV_INSTRUMENT_head$(BITS).o := n +KCOV_INSTRUMENT_sev.o := n CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace @@ -49,6 +48,7 @@ obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o +obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o @@ -145,6 +145,10 @@ obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o +obj-$(CONFIG_X86_CET) += cet.o + +obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 53369c57751e..2a0ea38955df 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -170,7 +170,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) */ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) { - unsigned int ver = 0; int cpu; if (id >= MAX_LOCAL_APIC) { @@ -183,10 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } - if (boot_cpu_physical_apicid != -1U) - ver = boot_cpu_apic_version; - - cpu = generic_processor_info(id, ver); + cpu = generic_processor_info(id); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -240,7 +236,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - if (!apic->apic_id_valid(apic_id)) { + if (!apic_id_valid(apic_id)) { if (enabled) pr_warn("x2apic entry ignored\n"); return 0; @@ -1182,7 +1178,7 @@ static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, acpi_mp_wake_mailbox_paddr = mp_wake->base_address; - acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); return 0; } @@ -1279,7 +1275,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) /* * if "noapic" boot option, don't look for IO-APICs */ - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { pr_info("Skipping IOAPIC probe due to 'noapic' option.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 099d58d02a26..a5ead6a6d233 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1527,6 +1527,7 @@ static noinline void __init int3_selftest(void) static __initdata int __alt_reloc_selftest_addr; +extern void __init __alt_reloc_selftest(void *arg); __visible noinline void __init __alt_reloc_selftest(void *arg) { WARN_ON(arg != &__alt_reloc_selftest_addr); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 035a3db5330b..356de955e78d 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -24,6 +24,8 @@ #define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 #define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 #define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 +#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a +#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 #define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 @@ -39,6 +41,7 @@ #define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc +#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 #define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 /* Protect the PCI config register pairs used for SMN. */ @@ -56,6 +59,8 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, {} }; @@ -85,6 +90,8 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, {} }; @@ -106,6 +113,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, {} }; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index a6fcaf16cdbf..2ee867d796d9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -7,7 +7,7 @@ # In particualr, smp_apic_timer_interrupt() is called in random places. KCOV_INSTRUMENT := n -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index af49e24b46a4..760adac3d1a8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -63,6 +63,8 @@ #include <asm/irq_regs.h> #include <asm/cpu.h> +#include "local.h" + unsigned int num_processors; unsigned disabled_cpus; @@ -74,11 +76,6 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); u8 boot_cpu_apic_version __ro_after_init; /* - * The highest APIC ID seen during enumeration. - */ -static unsigned int max_physical_apicid; - -/* * Bitmask of physically existing CPUs: */ physid_mask_t phys_cpu_present_map; @@ -104,26 +101,20 @@ static bool virt_ext_dest_id __ro_after_init; /* For parallel bootup. */ unsigned long apic_mmio_base __ro_after_init; +static inline bool apic_accessible(void) +{ + return x2apic_mode || apic_mmio_base; +} + /* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); #ifdef CONFIG_X86_32 - -/* - * On x86_32, the mapping between cpu and logical apicid may vary - * depending on apic in use. The following early percpu variable is - * used for the mapping. This is where the behaviors of x86_64 and 32 - * actually diverge. Let's keep it ugly for now. - */ -DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); - /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase __ro_after_init; @@ -179,8 +170,8 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif -unsigned long mp_lapic_addr __ro_after_init; -int disable_apic __ro_after_init; +static unsigned long mp_lapic_addr __ro_after_init; +bool apic_is_disabled __ro_after_init; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ @@ -206,8 +197,6 @@ unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); -static unsigned long apic_phys __ro_after_init; - /* * Get the LAPIC version */ @@ -247,31 +236,7 @@ static int modern_apic(void) */ static void __init apic_disable(void) { - pr_info("APIC: switched to apic NOOP\n"); - apic = &apic_noop; -} - -void native_apic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 native_safe_apic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - inc_irq_stat(icr_read_retry_count); - udelay(100); - } while (timeout++ < 1000); - - return send_status; + apic_install_driver(&apic_noop); } void native_apic_icr_write(u32 low, u32 id) @@ -537,7 +502,7 @@ static int lapic_timer_set_oneshot(struct clock_event_device *evt) static void lapic_timer_broadcast(const struct cpumask *mask) { #ifdef CONFIG_SMP - apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR); + __apic_send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif } @@ -810,7 +775,7 @@ bool __init apic_needs_pit(void) return true; /* Is there an APIC at all or is it disabled? */ - if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic) + if (!boot_cpu_has(X86_FEATURE_APIC) || apic_is_disabled) return true; /* @@ -1110,7 +1075,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); + apic_eoi(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); @@ -1134,8 +1099,7 @@ void clear_local_APIC(void) int maxlvt; u32 v; - /* APIC hasn't been mapped yet */ - if (!x2apic_mode && !apic_phys) + if (!apic_accessible()) return; maxlvt = lapic_get_maxlvt(); @@ -1225,8 +1189,7 @@ void apic_soft_disable(void) */ void disable_local_APIC(void) { - /* APIC hasn't been mapped yet */ - if (!x2apic_mode && !apic_phys) + if (!apic_accessible()) return; apic_soft_disable(); @@ -1299,7 +1262,7 @@ enum apic_intr_mode_id apic_intr_mode __ro_after_init; static int __init __apic_intr_mode_select(void) { /* Check kernel option */ - if (disable_apic) { + if (apic_is_disabled) { pr_info("APIC disabled via kernel command line\n"); return APIC_PIC; } @@ -1308,7 +1271,7 @@ static int __init __apic_intr_mode_select(void) #ifdef CONFIG_X86_64 /* On 64-bit, the APIC must be integrated, Check local APIC only */ if (!boot_cpu_has(X86_FEATURE_APIC)) { - disable_apic = 1; + apic_is_disabled = true; pr_info("APIC disabled by BIOS\n"); return APIC_PIC; } @@ -1317,16 +1280,15 @@ static int __init __apic_intr_mode_select(void) /* Neither 82489DX nor integrated APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) { - disable_apic = 1; + apic_is_disabled = true; return APIC_PIC; } /* If the BIOS pretends there is an integrated APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC) && APIC_INTEGRATED(boot_cpu_apic_version)) { - disable_apic = 1; - pr_err(FW_BUG "Local APIC %d not detected, force emulation\n", - boot_cpu_physical_apicid); + apic_is_disabled = true; + pr_err(FW_BUG "Local APIC not detected, force emulation\n"); return APIC_PIC; } #endif @@ -1347,12 +1309,6 @@ static int __init __apic_intr_mode_select(void) pr_info("APIC: SMP mode deactivated\n"); return APIC_SYMMETRIC_IO_NO_ROUTING; } - - if (read_apic_id() != boot_cpu_physical_apicid) { - panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - read_apic_id(), boot_cpu_physical_apicid); - /* Or can we switch back to PIC here? */ - } #endif return APIC_SYMMETRIC_IO; @@ -1439,7 +1395,9 @@ void __init apic_intr_mode_init(void) break; } - default_setup_apic_routing(); + x86_64_probe_apic(); + + x86_32_install_bigsmp(); if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1521,7 +1479,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * per set bit. */ for_each_set_bit(bit, isr->map, APIC_IR_BITS) - ack_APIC_irq(); + apic_eoi(); return true; } @@ -1533,7 +1491,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * interrupt from previous kernel might still have ISR bit set. * * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the ack_APIC_irq() because it thought, interrupt + * might not have done the apic_eoi() because it thought, interrupt * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear * the ISR bit and cpu thinks it has already serviced the interrupt. Hence * a vector might get locked. It was noticed for timer irq (vector @@ -1567,7 +1525,7 @@ static void setup_local_APIC(void) int cpu = smp_processor_id(); unsigned int value; - if (disable_apic) { + if (apic_is_disabled) { disable_ioapic_support(); return; } @@ -1589,36 +1547,18 @@ static void setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - BUG_ON(!apic->apic_id_registered()); + /* Validate that the APIC is registered if required */ + BUG_ON(apic->apic_id_registered && !apic->apic_id_registered()); /* * Intel recommends to set DFR, LDR and TPR before enabling * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... + * document number 292116). + * + * Except for APICs which operate in physical destination mode. */ - apic->init_apic_ldr(); - -#ifdef CONFIG_X86_32 - if (apic->dest_mode_logical) { - int logical_apicid, ldr_apicid; - - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches - * the actual value. - */ - logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - if (logical_apicid != BAD_APICID) - WARN_ON(logical_apicid != ldr_apicid); - /* Always use the value from LDR. */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; - } -#endif + if (apic->init_apic_ldr) + apic->init_apic_ldr(); /* * Set Task Priority to 'accept all except vectors 0-31'. An APIC @@ -1691,7 +1631,7 @@ static void setup_local_APIC(void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!cpu && (pic_mode || !value || skip_ioapic_setup)) { + if (!cpu && (pic_mode || !value || ioapic_is_disabled)) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); } else { @@ -1748,6 +1688,25 @@ void apic_ap_setup(void) end_local_APIC_setup(); } +static __init void cpu_set_boot_apic(void); + +static __init void apic_read_boot_cpu_id(bool x2apic) +{ + /* + * This can be invoked from check_x2apic() before the APIC has been + * selected. But that code knows for sure that the BIOS enabled + * X2APIC. + */ + if (x2apic) { + boot_cpu_physical_apicid = native_apic_msr_read(APIC_ID); + boot_cpu_apic_version = GET_APIC_VERSION(native_apic_msr_read(APIC_LVR)); + } else { + boot_cpu_physical_apicid = read_apic_id(); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); + } + cpu_set_boot_apic(); +} + #ifdef CONFIG_X86_X2APIC int x2apic_mode; EXPORT_SYMBOL_GPL(x2apic_mode); @@ -1847,6 +1806,8 @@ void x2apic_setup(void) __x2apic_enable(); } +static __init void apic_set_fixmap(void); + static __init void x2apic_disable(void) { u32 x2apic_id, state = x2apic_state; @@ -1867,7 +1828,7 @@ static __init void x2apic_disable(void) } __x2apic_disable(); - register_lapic_address(mp_lapic_addr); + apic_set_fixmap(); } static __init void x2apic_enable(void) @@ -1928,6 +1889,7 @@ void __init check_x2apic(void) x2apic_state = X2APIC_ON_LOCKED; else x2apic_state = X2APIC_ON; + apic_read_boot_cpu_id(true); } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } @@ -1943,7 +1905,7 @@ void __init check_x2apic(void) pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n"); pr_err("Disabling APIC, expect reduced performance and functionality.\n"); - disable_apic = 1; + apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); } @@ -1956,7 +1918,7 @@ void __init enable_IR_x2apic(void) unsigned long flags; int ret, ir_stat; - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); return; } @@ -1994,19 +1956,19 @@ void __init enable_IR_x2apic(void) * On AMD64 we trust the BIOS - if it says no APIC it is likely * not correctly set up (usually the APIC timer won't work etc.) */ -static int __init detect_init_APIC(void) +static bool __init detect_init_APIC(void) { if (!boot_cpu_has(X86_FEATURE_APIC)) { pr_info("No local APIC present\n"); - return -1; + return false; } - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - return 0; + register_lapic_address(APIC_DEFAULT_PHYS_BASE); + return true; } #else -static int __init apic_verify(void) +static bool __init apic_verify(unsigned long addr) { u32 features, h, l; @@ -2017,28 +1979,28 @@ static int __init apic_verify(void) features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { pr_warn("Could not enable APIC!\n"); - return -1; + return false; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ if (boot_cpu_data.x86 >= 6) { rdmsr(MSR_IA32_APICBASE, l, h); if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + addr = l & MSR_IA32_APICBASE_BASE; } + register_lapic_address(addr); pr_info("Found and enabled local APIC!\n"); - return 0; + return true; } -int __init apic_force_enable(unsigned long addr) +bool __init apic_force_enable(unsigned long addr) { u32 h, l; - if (disable_apic) - return -1; + if (apic_is_disabled) + return false; /* * Some BIOSes disable the local APIC in the APIC_BASE @@ -2055,17 +2017,17 @@ int __init apic_force_enable(unsigned long addr) enabled_via_apicbase = 1; } } - return apic_verify(); + return apic_verify(addr); } /* * Detect and initialize APIC */ -static int __init detect_init_APIC(void) +static bool __init detect_init_APIC(void) { /* Disabled by kernel option? */ - if (disable_apic) - return -1; + if (apic_is_disabled) + return false; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -2092,22 +2054,22 @@ static int __init detect_init_APIC(void) if (!force_enable_local_apic) { pr_info("Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); - return -1; + return false; } - if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) - return -1; + if (!apic_force_enable(APIC_DEFAULT_PHYS_BASE)) + return false; } else { - if (apic_verify()) - return -1; + if (!apic_verify(APIC_DEFAULT_PHYS_BASE)) + return false; } apic_pm_activate(); - return 0; + return true; no_apic: pr_info("No local APIC present or hardware disabled\n"); - return -1; + return false; } #endif @@ -2116,64 +2078,38 @@ no_apic: */ void __init init_apic_mappings(void) { - unsigned int new_apicid; - if (apic_validate_deadline_timer()) pr_info("TSC deadline timer available\n"); - if (x2apic_mode) { - boot_cpu_physical_apicid = read_apic_id(); + if (x2apic_mode) return; - } - /* If no local APIC can be found return early */ - if (!smp_found_config && detect_init_APIC()) { - /* lets NOP'ify apic operations */ - pr_info("APIC: disable apic facility\n"); - apic_disable(); - } else { - apic_phys = mp_lapic_addr; - - /* - * If the system has ACPI MADT tables or MP info, the LAPIC - * address is already registered. - */ - if (!acpi_lapic && !smp_found_config) - register_lapic_address(apic_phys); + if (!smp_found_config) { + if (!detect_init_APIC()) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } + num_processors = 1; } +} - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - new_apicid = read_apic_id(); - if (boot_cpu_physical_apicid != new_apicid) { - boot_cpu_physical_apicid = new_apicid; - /* - * yeah -- we lie about apic_version - * in case if apic was disabled via boot option - * but it's not a problem for SMP compiled kernel - * since apic_intr_mode_select is prepared for such - * a case and disable smp mode - */ - boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); - } +static __init void apic_set_fixmap(void) +{ + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + apic_mmio_base = APIC_BASE; + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + apic_mmio_base, mp_lapic_addr); + apic_read_boot_cpu_id(false); } void __init register_lapic_address(unsigned long address) { + /* This should only happen once */ + WARN_ON_ONCE(mp_lapic_addr); mp_lapic_addr = address; - if (!x2apic_mode) { - set_fixmap_nocache(FIX_APIC_BASE, address); - apic_mmio_base = APIC_BASE; - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, address); - } - if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = read_apic_id(); - boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); - } + if (!x2apic_mode) + apic_set_fixmap(); } /* @@ -2210,7 +2146,7 @@ static noinline void handle_spurious_interrupt(u8 vector) if (v & (1 << (vector & 0x1f))) { pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", vector, smp_processor_id()); - ack_APIC_irq(); + apic_eoi(); } else { pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", vector, smp_processor_id()); @@ -2261,7 +2197,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); v = apic_read(APIC_ESR); - ack_APIC_irq(); + apic_eoi(); atomic_inc(&irq_err_count); apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", @@ -2446,54 +2382,43 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -int generic_processor_info(int apicid, int version) +static void cpu_update_apic(int cpu, int apicid) { - int cpu, max = nr_cpu_ids; - bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, - phys_cpu_present_map); +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; +#endif + set_cpu_possible(cpu, true); + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; - /* - * boot_cpu_physical_apicid is designed to have the apicid - * returned by read_apic_id(), i.e, the apicid of the - * currently booting-up processor. However, on some platforms, - * it is temporarily modified by the apicid reported as BSP - * through MP table. Concretely: - * - * - arch/x86/kernel/mpparse.c: MP_processor_info() - * - arch/x86/mm/amdtopology.c: amd_numa_init() - * - * This function is executed with the modified - * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel - * parameter doesn't work to disable APs on kdump 2nd kernel. - * - * Since fixing handling of boot_cpu_physical_apicid requires - * another discussion and tests on each platform, we leave it - * for now and here we use read_apic_id() directly in this - * function, generic_processor_info(). - */ - if (disabled_cpu_apicid != BAD_APICID && - disabled_cpu_apicid != read_apic_id() && - disabled_cpu_apicid == apicid) { - int thiscpu = num_processors + disabled_cpus; + if (system_state != SYSTEM_BOOTING) + cpu_mark_primary_thread(cpu, apicid); +} - pr_warn("APIC: Disabling requested cpu." - " Processor %d/0x%x ignored.\n", thiscpu, apicid); +static __init void cpu_set_boot_apic(void) +{ + cpuid_to_apicid[0] = boot_cpu_physical_apicid; + cpu_update_apic(0, boot_cpu_physical_apicid); + x86_32_probe_bigsmp_early(); +} - disabled_cpus++; - return -ENODEV; - } +int generic_processor_info(int apicid) +{ + int cpu, max = nr_cpu_ids; - /* - * If boot cpu has not been detected yet, then only allow upto - * nr_cpu_ids - 1 processors and keep one slot free for boot cpu - */ - if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 && - apicid != boot_cpu_physical_apicid) { - int thiscpu = max + disabled_cpus - 1; + /* The boot CPU must be set before MADT/MPTABLE parsing happens */ + if (cpuid_to_apicid[0] == BAD_APICID) + panic("Boot CPU APIC not registered yet\n"); - pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" - " reached. Keeping one slot for boot cpu." - " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + if (apicid == boot_cpu_physical_apicid) + return 0; + + if (disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n", + thiscpu, apicid); disabled_cpus++; return -ENODEV; @@ -2509,66 +2434,16 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - * boot_cpu_init() already hold bit 0 in cpu_present_mask - * for BSP. - */ - cpu = 0; - - /* Logical cpuid 0 is reserved for BSP. */ - cpuid_to_apicid[0] = apicid; - } else { - cpu = allocate_logical_cpuid(apicid); - if (cpu < 0) { - disabled_cpus++; - return -EINVAL; - } - } - - /* - * Validate version - */ - if (version == 0x0) { - pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", - cpu, apicid); - version = 0x10; - } - - if (version != boot_cpu_apic_version) { - pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - boot_cpu_apic_version, cpu, version); + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - -#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) - early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; - early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; -#endif -#ifdef CONFIG_X86_32 - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - apic->x86_32_early_logical_apicid(cpu); -#endif - set_cpu_possible(cpu, true); - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - num_processors++; - - if (system_state != SYSTEM_BOOTING) - cpu_mark_primary_thread(cpu, apicid); - + cpu_update_apic(cpu, apicid); return cpu; } -int hard_smp_processor_id(void) -{ - return read_apic_id(); -} void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar) @@ -2610,47 +2485,10 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) } EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); -#ifdef CONFIG_X86_64 -void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) - (*drv)->wakeup_secondary_cpu_64 = handler; -} -#endif - -/* - * Override the generic EOI implementation with an optimized version. - * Only called during early boot when only one CPU is active and with - * interrupts disabled, so we know this does not race with actual APIC driver - * use. - */ -void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - /* Should happen once for each apic */ - WARN_ON((*drv)->eoi_write == eoi_write); - (*drv)->native_eoi_write = (*drv)->eoi_write; - (*drv)->eoi_write = eoi_write; - } -} - static void __init apic_bsp_up_setup(void) { #ifdef CONFIG_X86_64 apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); -#else - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -# ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = read_apic_id(); -# endif #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); } @@ -2919,7 +2757,7 @@ int apic_is_clustered_box(void) */ static int __init setup_disableapic(char *arg) { - disable_apic = 1; + apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } @@ -2956,11 +2794,11 @@ early_param("nolapic_timer", parse_nolapic_timer); static int __init apic_set_verbosity(char *arg) { if (!arg) { -#ifdef CONFIG_X86_64 - skip_ioapic_setup = 0; + if (IS_ENABLED(CONFIG_X86_32)) + return -EINVAL; + + ioapic_is_disabled = false; return 0; -#endif - return -EINVAL; } if (strcmp("debug", arg) == 0) @@ -2981,11 +2819,11 @@ early_param("apic", apic_set_verbosity); static int __init lapic_insert_resource(void) { - if (!apic_phys) + if (!apic_mmio_base) return -1; /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; + lapic_resource.start = apic_mmio_base; lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; insert_resource(&iomem_resource, &lapic_resource); diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 02b4839478b1..7bc5d9bf59cd 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -6,6 +6,8 @@ #include <linux/irq.h> #include <asm/apic.h> +#include "local.h" + u32 apic_default_calc_apicid(unsigned int cpu) { return per_cpu(x86_cpu_to_apicid, cpu); @@ -29,18 +31,27 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) int default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + return (int)per_cpu(x86_cpu_to_apicid, mps_cpu); else return BAD_APICID; } EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); -int default_check_phys_apicid_present(int phys_apicid) +bool default_apic_id_registered(void) { - return physid_isset(phys_apicid, phys_cpu_present_map); + return physid_isset(read_apic_id(), phys_cpu_present_map); } -int default_apic_id_valid(u32 apicid) +/* + * Set up the logical destination ID when the APIC operates in logical + * destination mode. + */ +void default_init_apic_ldr(void) { - return (apicid < 255); + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 8f72b4351c9f..032a84e2c3cc 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -28,26 +28,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -void flat_init_apic_ldr(void) -{ - unsigned long val; - unsigned long num, id; - - num = smp_processor_id(); - id = 1UL << num; - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; @@ -86,16 +66,6 @@ static u32 set_apic_id(unsigned int id) return (id & 0xFF) << 24; } -static unsigned int read_xapic_id(void) -{ - return flat_get_apic_id(apic_read(APIC_ID)); -} - -static int flat_apic_id_registered(void) -{ - return physid_isset(read_xapic_id(), phys_cpu_present_map); -} - static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; @@ -110,23 +80,18 @@ static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = flat_apic_id_registered, + .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, + .init_apic_ldr = default_init_apic_ldr, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = flat_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -139,15 +104,13 @@ static struct apic apic_flat __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; /* @@ -178,22 +141,9 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_init_apic_ldr(void) -{ - /* - * LDR and DFR are not involved in physflat mode, rather: - * "In physical destination mode, the destination processor is - * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1) - */ -} - static int physflat_probe(void) { - if (apic == &apic_physflat || num_possible_cpus() > 8 || - jailhouse_paravirt()) - return 1; - - return 0; + return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt(); } static struct apic apic_physflat __ro_after_init = { @@ -201,8 +151,7 @@ static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = flat_apic_id_registered, + .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, @@ -210,14 +159,11 @@ static struct apic apic_physflat __ro_after_init = { .disable_esr = 0, .check_apicid_used = NULL, - .init_apic_ldr = physflat_init_apic_ldr, .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = flat_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -230,15 +176,13 @@ static struct apic apic_physflat __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; /* diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index fe78319e0f7a..966d7cf10b95 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -8,92 +8,42 @@ * Though in case if apic is disabled (for some reason) we try * to not uglify the caller's code and allow to call (some) apic routines * like self-ipi, etc... + * + * FIXME: Remove this gunk. The above argument which was intentionally left + * in place is silly to begin with because none of the callbacks except for + * APIC::read/write() have a WARN_ON_ONCE() in them. Sigh... */ #include <linux/cpumask.h> #include <linux/thread_info.h> #include <asm/apic.h> -static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_allbutself(int vector) { } static void noop_send_IPI_all(int vector) { } static void noop_send_IPI_self(int vector) { } -static void noop_apic_wait_icr_idle(void) { } static void noop_apic_icr_write(u32 low, u32 id) { } - -static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) -{ - return -1; -} - -static u32 noop_safe_apic_wait_icr_idle(void) -{ - return 0; -} - -static u64 noop_apic_icr_read(void) -{ - return 0; -} - -static int noop_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return 0; -} - -static unsigned int noop_get_apic_id(unsigned long x) -{ - return 0; -} - -static int noop_probe(void) -{ - /* - * NOOP apic should not ever be - * enabled via probe routine - */ - return 0; -} - -static int noop_apic_id_registered(void) -{ - /* - * if we would be really "pedantic" - * we should pass read_apic_id() here - * but since NOOP suppose APIC ID = 0 - * lets save a few cycles - */ - return physid_isset(0, phys_cpu_present_map); -} +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; } +static u64 noop_apic_icr_read(void) { return 0; } +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; } +static unsigned int noop_get_apic_id(unsigned long x) { return 0; } +static void noop_apic_eoi(void) { } static u32 noop_apic_read(u32 reg) { - WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled); return 0; } -static void noop_apic_write(u32 reg, u32 v) -{ - WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); -} - -#ifdef CONFIG_X86_32 -static int noop_x86_32_early_logical_apicid(int cpu) +static void noop_apic_write(u32 reg, u32 val) { - return BAD_APICID; + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled); } -#endif struct apic apic_noop __ro_after_init = { .name = "noop", - .probe = noop_probe, - .acpi_madt_oem_check = NULL, - - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = noop_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, @@ -101,18 +51,13 @@ struct apic apic_noop __ro_after_init = { .disable_esr = 0, .check_apicid_used = default_check_apicid_used, - .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = noop_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = noop_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_flat_calc_apicid, @@ -125,17 +70,9 @@ struct apic apic_noop __ro_after_init = { .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, - .inquire_remote_apic = NULL, - .read = noop_apic_read, .write = noop_apic_write, - .eoi_write = noop_apic_write, + .eoi = noop_apic_eoi, .icr_read = noop_apic_icr_read, .icr_write = noop_apic_icr_write, - .wait_icr_idle = noop_apic_wait_icr_idle, - .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, - -#ifdef CONFIG_X86_32 - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, -#endif }; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a54d817eb4b6..63f3d7be9dc7 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,17 +56,6 @@ static u32 numachip2_set_apic_id(unsigned int id) return id << 24; } -static int numachip_apic_id_valid(u32 apicid) -{ - /* Trust what bootloader passes in MADT */ - return 1; -} - -static int numachip_apic_id_registered(void) -{ - return 1; -} - static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; @@ -228,38 +217,20 @@ static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -/* APIC IPIs are queued */ -static void numachip_apic_wait_icr_idle(void) -{ -} - -/* APIC NMI IPIs are queued */ -static u32 numachip_safe_apic_wait_icr_idle(void) -{ - return 0; -} - static const struct apic apic_numachip1 __refconst = { .name = "NumaConnect system", .probe = numachip1_probe, .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, - .apic_id_valid = numachip_apic_id_valid, - .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, @@ -273,15 +244,12 @@ static const struct apic apic_numachip1 __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = numachip_apic_wait_icr_idle, - .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; apic_driver(apic_numachip1); @@ -290,23 +258,16 @@ static const struct apic apic_numachip2 __refconst = { .name = "NumaConnect2 system", .probe = numachip2_probe, .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, - .apic_id_valid = numachip_apic_id_valid, - .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, @@ -320,15 +281,12 @@ static const struct apic apic_numachip2 __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = numachip_apic_wait_icr_idle, - .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; apic_driver(apic_numachip2); diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 77555f66c14d..0e5535add4b5 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -18,56 +18,17 @@ static unsigned bigsmp_get_apic_id(unsigned long x) return (x >> 24) & 0xFF; } -static int bigsmp_apic_id_registered(void) -{ - return 1; -} - static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return false; } -static int bigsmp_early_logical_apicid(int cpu) -{ - /* on bigsmp, logical apicid is the same as physical */ - return early_per_cpu(x86_cpu_to_apicid, cpu); -} - -/* - * bigsmp enables physical destination mode - * and doesn't use LDR and DFR - */ -static void bigsmp_init_apic_ldr(void) -{ -} - -static void bigsmp_setup_apic_routing(void) -{ - printk(KERN_INFO - "Enabling APIC mode: Physflat. Using %d I/O APICs\n", - nr_ioapics); -} - -static int bigsmp_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < nr_cpu_ids) - return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); - - return BAD_APICID; -} - static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ physids_promote(0xFFL, retmap); } -static int bigsmp_check_phys_apicid_present(int phys_apicid) -{ - return 1; -} - static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -111,21 +72,13 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { static int probe_bigsmp(void) { - if (def_to_bigsmp) - dmi_bigsmp = 1; - else - dmi_check_system(bigsmp_dmi_table); - - return dmi_bigsmp; + return dmi_check_system(bigsmp_dmi_table); } static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, - .acpi_madt_oem_check = NULL, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = bigsmp_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, @@ -133,14 +86,11 @@ static struct apic apic_bigsmp __ro_after_init = { .disable_esr = 1, .check_apicid_used = bigsmp_check_apicid_used, - .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, - .setup_apic_routing = bigsmp_setup_apic_routing, - .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - .check_phys_apicid_present = bigsmp_check_phys_apicid_present, + .cpu_present_to_apicid = default_cpu_present_to_apicid, .phys_pkg_id = bigsmp_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, @@ -153,37 +103,24 @@ static struct apic apic_bigsmp __ro_after_init = { .send_IPI_all = bigsmp_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; -void __init generic_bigsmp_probe(void) +bool __init apic_bigsmp_possible(bool cmdline_override) { - unsigned int cpu; - - if (!probe_bigsmp()) - return; - - apic = &apic_bigsmp; - - for_each_possible_cpu(cpu) { - if (early_per_cpu(x86_cpu_to_logical_apicid, - cpu) == BAD_APICID) - continue; - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - bigsmp_early_logical_apicid(cpu); - } + return apic == &apic_bigsmp || !cmdline_override; +} - pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name); +void __init apic_bigsmp_force(void) +{ + if (apic != &apic_bigsmp) + apic_install_driver(&apic_bigsmp); } apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 34a992e275ef..45af535c44a0 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -21,6 +21,8 @@ #include <linux/init.h> #include <linux/delay.h> +#include "local.h" + #ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { @@ -31,12 +33,12 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #ifdef arch_trigger_cpumask_backtrace static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - apic->send_IPI_mask(mask, NMI_VECTOR); + __apic_send_IPI_mask(mask, NMI_VECTOR); } -void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) { - nmi_trigger_cpumask_backtrace(mask, exclude_self, + nmi_trigger_cpumask_backtrace(mask, exclude_cpu, nmi_raise_cpu_backtrace); } diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c new file mode 100644 index 000000000000..821e2e536f19 --- /dev/null +++ b/arch/x86/kernel/apic/init.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) "APIC: " fmt + +#include <asm/apic.h> + +#include "local.h" + +/* + * Use DEFINE_STATIC_CALL_NULL() to avoid having to provide stub functions + * for each callback. The callbacks are setup during boot and all except + * wait_icr_idle() must be initialized before usage. The IPI wrappers + * use static_call() and not static_call_cond() to catch any fails. + */ +#define DEFINE_APIC_CALL(__cb) \ + DEFINE_STATIC_CALL_NULL(apic_call_##__cb, *apic->__cb) + +DEFINE_APIC_CALL(eoi); +DEFINE_APIC_CALL(native_eoi); +DEFINE_APIC_CALL(icr_read); +DEFINE_APIC_CALL(icr_write); +DEFINE_APIC_CALL(read); +DEFINE_APIC_CALL(send_IPI); +DEFINE_APIC_CALL(send_IPI_mask); +DEFINE_APIC_CALL(send_IPI_mask_allbutself); +DEFINE_APIC_CALL(send_IPI_allbutself); +DEFINE_APIC_CALL(send_IPI_all); +DEFINE_APIC_CALL(send_IPI_self); +DEFINE_APIC_CALL(wait_icr_idle); +DEFINE_APIC_CALL(wakeup_secondary_cpu); +DEFINE_APIC_CALL(wakeup_secondary_cpu_64); +DEFINE_APIC_CALL(write); + +EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask); +EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self); + +/* The container for function call overrides */ +struct apic_override __x86_apic_override __initdata; + +#define apply_override(__cb) \ + if (__x86_apic_override.__cb) \ + apic->__cb = __x86_apic_override.__cb + +static __init void restore_override_callbacks(void) +{ + apply_override(eoi); + apply_override(native_eoi); + apply_override(write); + apply_override(read); + apply_override(send_IPI); + apply_override(send_IPI_mask); + apply_override(send_IPI_mask_allbutself); + apply_override(send_IPI_allbutself); + apply_override(send_IPI_all); + apply_override(send_IPI_self); + apply_override(icr_read); + apply_override(icr_write); + apply_override(wakeup_secondary_cpu); + apply_override(wakeup_secondary_cpu_64); +} + +#define update_call(__cb) \ + static_call_update(apic_call_##__cb, *apic->__cb) + +static __init void update_static_calls(void) +{ + update_call(eoi); + update_call(native_eoi); + update_call(write); + update_call(read); + update_call(send_IPI); + update_call(send_IPI_mask); + update_call(send_IPI_mask_allbutself); + update_call(send_IPI_allbutself); + update_call(send_IPI_all); + update_call(send_IPI_self); + update_call(icr_read); + update_call(icr_write); + update_call(wait_icr_idle); + update_call(wakeup_secondary_cpu); + update_call(wakeup_secondary_cpu_64); +} + +void __init apic_setup_apic_calls(void) +{ + /* Ensure that the default APIC has native_eoi populated */ + apic->native_eoi = apic->eoi; + update_static_calls(); + pr_info("Static calls initialized\n"); +} + +void __init apic_install_driver(struct apic *driver) +{ + if (apic == driver) + return; + + apic = driver; + + if (IS_ENABLED(CONFIG_X86_X2APIC) && apic->x2apic_set_max_apicid) + apic->max_apic_id = x2apic_max_apicid; + + /* Copy the original eoi() callback as KVM/HyperV might overwrite it */ + if (!apic->native_eoi) + apic->native_eoi = apic->eoi; + + /* Apply any already installed callback overrides */ + restore_override_callbacks(); + update_static_calls(); + + pr_info("Switched APIC routing to: %s\n", driver->name); +} diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4241dc243aa8..00da6cf6b07d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -178,7 +178,7 @@ int mp_bus_id_to_type[MAX_MP_BUSSES]; DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -int skip_ioapic_setup; +bool ioapic_is_disabled __ro_after_init; /** * disable_ioapic_support() - disables ioapic support at runtime @@ -189,7 +189,7 @@ void disable_ioapic_support(void) noioapicquirk = 1; noioapicreroute = -1; #endif - skip_ioapic_setup = 1; + ioapic_is_disabled = true; } static int __init parse_noapic(char *str) @@ -831,7 +831,7 @@ static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) { int ioapic, pin, idx; - if (skip_ioapic_setup) + if (ioapic_is_disabled) return -1; ioapic = mp_find_ioapic(gsi); @@ -1366,7 +1366,7 @@ void __init enable_IO_APIC(void) int i8259_apic, i8259_pin; int apic, pin; - if (skip_ioapic_setup) + if (ioapic_is_disabled) nr_ioapics = 0; if (!nr_legacy_irqs() || !nr_ioapics) @@ -1511,13 +1511,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) physid_set(i, phys_id_present_map); ioapics[ioapic_idx].mp_config.apicid = i; } else { - physid_mask_t tmp; - apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), - &tmp); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mpc_ioapic_id(ioapic_idx)); - physids_or(phys_id_present_map, phys_id_present_map, tmp); + apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map); } /* @@ -1827,7 +1823,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) * We must acknowledge the irq before we move it or the acknowledge will * not propagate properly. */ - ack_APIC_irq(); + apic_eoi(); /* * Tail end of clearing remote IRR bit (either by delivering the EOI @@ -2050,7 +2046,7 @@ static void unmask_lapic_irq(struct irq_data *data) static void ack_lapic_irq(struct irq_data *data) { - ack_APIC_irq(); + apic_eoi(); } static struct irq_chip lapic_chip __read_mostly = { @@ -2095,7 +2091,7 @@ static inline void __init unlock_ExtINT_logic(void) entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); - apic_id = hard_smp_processor_id(); + apic_id = read_apic_id(); memset(&entry1, 0, sizeof(entry1)); entry1.dest_mode_logical = true; @@ -2399,7 +2395,7 @@ void __init setup_IO_APIC(void) { int ioapic; - if (skip_ioapic_setup || !nr_ioapics) + if (ioapic_is_disabled || !nr_ioapics) return; io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; @@ -2546,7 +2542,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) apic_id = i; } - apic->apicid_to_cpu_present(apic_id, &tmp); + physid_set_mask_of_physid(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { @@ -2715,7 +2711,7 @@ void __init io_apic_init_mappings(void) "address found in MPTABLE, " "disabling IO/APIC support!\n"); smp_found_config = 0; - skip_ioapic_setup = 1; + ioapic_is_disabled = true; goto fake_ioapic_page; } #endif diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 2a6509e8c840..a44ba7209ef3 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> +#include <linux/delay.h> #include <linux/smp.h> + #include <asm/io_apic.h> #include "local.h" @@ -52,9 +54,9 @@ void apic_send_IPI_allbutself(unsigned int vector) return; if (static_branch_likely(&apic_use_ipi_shorthand)) - apic->send_IPI_allbutself(vector); + __apic_send_IPI_allbutself(vector); else - apic->send_IPI_mask_allbutself(cpu_online_mask, vector); + __apic_send_IPI_mask_allbutself(cpu_online_mask, vector); } /* @@ -68,12 +70,12 @@ void native_smp_send_reschedule(int cpu) WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } - apic->send_IPI(cpu, RESCHEDULE_VECTOR); + __apic_send_IPI(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); + __apic_send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) @@ -85,14 +87,14 @@ void native_send_call_func_ipi(const struct cpumask *mask) goto sendmask; if (cpumask_test_cpu(cpu, mask)) - apic->send_IPI_all(CALL_FUNCTION_VECTOR); + __apic_send_IPI_all(CALL_FUNCTION_VECTOR); else if (num_online_cpus() > 1) - apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); + __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR); return; } sendmask: - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } #endif /* CONFIG_SMP */ @@ -102,74 +104,77 @@ static inline int __prepare_ICR2(unsigned int mask) return SET_XAPIC_DEST_FIELD(mask); } -static inline void __xapic_wait_icr_idle(void) +u32 apic_mem_wait_icr_idle_timeout(void) +{ + int cnt; + + for (cnt = 0; cnt < 1000; cnt++) { + if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY)) + return 0; + inc_irq_stat(icr_read_retry_count); + udelay(100); + } + return APIC_ICR_BUSY; +} + +void apic_mem_wait_icr_idle(void) { while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } -void __default_send_IPI_shortcut(unsigned int shortcut, int vector) +/* + * This is safe against interruption because it only writes the lower 32 + * bits of the APIC_ICR register. The destination field is ignored for + * short hand IPIs. + * + * wait_icr_idle() + * write(ICR2, dest) + * NMI + * wait_icr_idle() + * write(ICR) + * wait_icr_idle() + * write(ICR) + * + * This function does not need to disable interrupts as there is no ICR2 + * interaction. The memory write is direct except when the machine is + * affected by the 11AP Pentium erratum, which turns the plain write into + * an XCHG operation. + */ +static void __default_send_IPI_shortcut(unsigned int shortcut, int vector) { /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. + * Wait for the previous ICR command to complete. Use + * safe_apic_wait_icr_idle() for the NMI vector as there have been + * issues where otherwise the system hangs when the panic CPU tries + * to stop the others before launching the kdump kernel. */ if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); + apic_mem_wait_icr_idle_timeout(); else - __xapic_wait_icr_idle(); + apic_mem_wait_icr_idle(); - /* - * No need to touch the target chip field. Also the destination - * mode is ignored when a shorthand is used. - */ - cfg = __prepare_ICR(shortcut, vector, 0); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); + /* Destination field (ICR2) and the destination mode are ignored */ + native_apic_mem_write(APIC_ICR, __prepare_ICR(shortcut, vector, 0)); } /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +void __default_send_IPI_dest_field(unsigned int dest_mask, int vector, + unsigned int dest_mode) { - unsigned long cfg; - - /* - * Wait for idle. - */ + /* See comment in __default_send_IPI_shortcut() */ if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); + apic_mem_wait_icr_idle_timeout(); else - __xapic_wait_icr_idle(); + apic_mem_wait_icr_idle(); - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - native_apic_mem_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); + /* Set the IPI destination field in the ICR */ + native_apic_mem_write(APIC_ICR2, __prepare_ICR2(dest_mask)); + /* Send it with the proper destination mode */ + native_apic_mem_write(APIC_ICR, __prepare_ICR(0, vector, dest_mode)); } void default_send_IPI_single_phys(int cpu, int vector) @@ -184,18 +189,13 @@ void default_send_IPI_single_phys(int cpu, int vector) void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) { - unsigned long query_cpu; unsigned long flags; + unsigned long cpu; - /* - * Hack. The clustered APIC addressing mode doesn't allow us to send - * to an arbitrary mask, so I do a unicast to each CPU instead. - * - mbligh - */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) { + for_each_cpu(cpu, mask) { __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, - query_cpu), vector, APIC_DEST_PHYSICAL); + cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } @@ -203,18 +203,15 @@ void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector) { - unsigned int this_cpu = smp_processor_id(); - unsigned int query_cpu; + unsigned int cpu, this_cpu = smp_processor_id(); unsigned long flags; - /* See Hack comment above */ - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) + for_each_cpu(cpu, mask) { + if (cpu == this_cpu) continue; __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, - query_cpu), vector, APIC_DEST_PHYSICAL); + cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } @@ -224,7 +221,7 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, */ void default_send_IPI_single(int cpu, int vector) { - apic->send_IPI_mask(cpumask_of(cpu), vector); + __apic_send_IPI_mask(cpumask_of(cpu), vector); } void default_send_IPI_allbutself(int vector) @@ -243,50 +240,32 @@ void default_send_IPI_self(int vector) } #ifdef CONFIG_X86_32 - -void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, - int vector) +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector) { unsigned long flags; - unsigned int query_cpu; - - /* - * Hack. The clustered APIC addressing mode doesn't allow us to send - * to an arbitrary mask, so I do a unicasts to each CPU instead. This - * should be modified to do 1 message per cluster ID - mbligh - */ + unsigned int cpu; local_irq_save(flags); - for_each_cpu(query_cpu, mask) - __default_send_IPI_dest_field( - early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); + for_each_cpu(cpu, mask) + __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector) { + unsigned int cpu, this_cpu = smp_processor_id(); unsigned long flags; - unsigned int query_cpu; - unsigned int this_cpu = smp_processor_id(); - - /* See Hack comment above */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) + for_each_cpu(cpu, mask) { + if (cpu == this_cpu) continue; - __default_send_IPI_dest_field( - early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); - } + __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL); + } local_irq_restore(flags); } -/* - * This is only used on smaller machines. - */ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; @@ -301,7 +280,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_restore(flags); } -/* must come after the send_IPI functions above for inlining */ +#ifdef CONFIG_SMP static int convert_apicid_to_cpu(int apic_id) { int i; @@ -320,7 +299,7 @@ int safe_smp_processor_id(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; - apicid = hard_smp_processor_id(); + apicid = read_apic_id(); if (apicid == BAD_APICID) return 0; @@ -329,3 +308,4 @@ int safe_smp_processor_id(void) return cpuid >= 0 ? cpuid : 0; } #endif +#endif diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index a997d849509a..ec219c659c7d 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -13,18 +13,16 @@ #include <asm/irq_vectors.h> #include <asm/apic.h> -/* APIC flat 64 */ -void flat_init_apic_ldr(void); - /* X2APIC */ -int x2apic_apic_id_valid(u32 apicid); -int x2apic_apic_id_registered(void); void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); unsigned int x2apic_get_apic_id(unsigned long id); u32 x2apic_set_apic_id(unsigned int id); int x2apic_phys_pkg_id(int initial_apicid, int index_msb); + +void x2apic_send_IPI_all(int vector); +void x2apic_send_IPI_allbutself(int vector); void x2apic_send_IPI_self(int vector); -void __x2apic_send_IPI_shorthand(int vector, u32 which); +extern u32 x2apic_max_apicid; /* IPI */ @@ -46,7 +44,10 @@ static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, return icr; } -void __default_send_IPI_shortcut(unsigned int shortcut, int vector); +void default_init_apic_ldr(void); + +void apic_mem_wait_icr_idle(void); +u32 apic_mem_wait_icr_idle_timeout(void); /* * This is used to send an IPI with no shorthand notation (the destination is @@ -62,8 +63,23 @@ void default_send_IPI_allbutself(int vector); void default_send_IPI_all(int vector); void default_send_IPI_self(int vector); +bool default_apic_id_registered(void); + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); +void x86_32_probe_bigsmp_early(void); +void x86_32_install_bigsmp(void); +#else +static inline void x86_32_probe_bigsmp_early(void) { } +static inline void x86_32_install_bigsmp(void) { } +#endif + +#ifdef CONFIG_X86_BIGSMP +bool apic_bigsmp_possible(bool cmdline_selected); +void apic_bigsmp_force(void); +#else +static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; +static inline void apic_bigsmp_force(void) { } #endif diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 35d5b8fb18ef..6b6b711678fe 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -269,7 +269,7 @@ static const struct msi_parent_ops x86_vector_msi_parent_ops = { struct irq_domain * __init native_create_pci_msi_domain(void) { - if (disable_apic) + if (apic_is_disabled) return NULL; x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index a61f642b1b90..9a06df6cdd68 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -10,46 +10,14 @@ #include <linux/errno.h> #include <linux/smp.h> +#include <xen/xen.h> + #include <asm/io_apic.h> #include <asm/apic.h> #include <asm/acpi.h> #include "local.h" -static int default_x86_32_early_logical_apicid(int cpu) -{ - return 1 << cpu; -} - -static void setup_apic_flat_routing(void) -{ -#ifdef CONFIG_X86_IO_APIC - printk(KERN_INFO - "Enabling APIC mode: Flat. Using %d I/O APICs\n", - nr_ioapics); -#endif -} - -static int default_apic_id_registered(void) -{ - return physid_isset(read_apic_id(), phys_cpu_present_map); -} - -/* - * Set up the logical destination ID. Intel recommends to set DFR, LDR and - * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual" - * (Intel document number 292116). - */ -static void default_init_apic_ldr(void) -{ - unsigned long val; - - apic_write(APIC_DFR, APIC_DFR_VALUE); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); - apic_write(APIC_LDR, val); -} - static int default_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -65,8 +33,6 @@ static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, - .acpi_madt_oem_check = NULL, - .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, @@ -77,14 +43,11 @@ static struct apic apic_default __ro_after_init = { .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = default_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = default_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_flat_calc_apicid, @@ -95,17 +58,13 @@ static struct apic apic_default __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; apic_driver(apic_default); @@ -123,7 +82,7 @@ static int __init parse_apic(char *arg) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if (!strcmp((*drv)->name, arg)) { - apic = *drv; + apic_install_driver(*drv); cmdline_apic = 1; return 0; } @@ -134,49 +93,43 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init default_setup_apic_routing(void) +void __init x86_32_probe_bigsmp_early(void) { - int version = boot_cpu_apic_version; + if (nr_cpu_ids <= 8 || xen_pv_domain()) + return; - if (num_possible_cpus() > 8) { + if (IS_ENABLED(CONFIG_X86_BIGSMP)) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; + if (!APIC_XAPIC(boot_cpu_apic_version)) break; - } /* P4 and above */ fallthrough; case X86_VENDOR_HYGON: case X86_VENDOR_AMD: - def_to_bigsmp = 1; + if (apic_bigsmp_possible(cmdline_apic)) + return; + break; } } + pr_info("Limiting to 8 possible CPUs\n"); + set_nr_cpu_ids(8); +} -#ifdef CONFIG_X86_BIGSMP - /* - * This is used to switch to bigsmp mode when - * - There is no apic= option specified by the user - * - generic_apic_probe() has chosen apic_default as the sub_arch - * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support - */ - - if (!cmdline_apic && apic == &apic_default) - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); +void __init x86_32_install_bigsmp(void) +{ + if (nr_cpu_ids > 8 && !xen_pv_domain()) + apic_bigsmp_force(); } -void __init generic_apic_probe(void) +void __init x86_32_probe_apic(void) { if (!cmdline_apic) { struct apic **drv; for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->probe()) { - apic = *drv; + apic_install_driver(*drv); break; } } @@ -184,26 +137,4 @@ void __init generic_apic_probe(void) if (drv == __apicdrivers_end) panic("Didn't find an APIC driver"); } - printk(KERN_INFO "Using APIC driver %s\n", apic->name); -} - -/* This function can switch the APIC even after the initial ->probe() */ -int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - if (!(*drv)->acpi_madt_oem_check) - continue; - if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) - continue; - - if (!cmdline_apic) { - apic = *drv; - printk(KERN_INFO "Switched to APIC driver `%s'.\n", - apic->name); - } - return 1; - } - return 0; } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c46720f185c0..ecdf0c4121e1 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -13,10 +13,8 @@ #include "local.h" -/* - * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. - */ -void __init default_setup_apic_routing(void) +/* Select the appropriate APIC driver */ +void __init x86_64_probe_apic(void) { struct apic **drv; @@ -24,11 +22,7 @@ void __init default_setup_apic_routing(void) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->probe && (*drv)->probe()) { - if (apic != *drv) { - apic = *drv; - pr_info("Switched APIC routing to %s.\n", - apic->name); - } + apic_install_driver(*drv); break; } } @@ -40,11 +34,7 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { - if (apic != *drv) { - apic = *drv; - pr_info("Setting APIC routing to %s.\n", - apic->name); - } + apic_install_driver(*drv); return 1; } } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c1efebd27e6c..319448d87b99 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask; static struct irq_chip lapic_controller; static struct irq_matrix *vector_matrix; #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct hlist_head, cleanup_list); + +static void vector_cleanup_callback(struct timer_list *tmr); + +struct vector_cleanup { + struct hlist_head head; + struct timer_list timer; +}; + +static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = { + .head = HLIST_HEAD_INIT, + .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED), +}; #endif void lock_vector_lock(void) @@ -536,7 +547,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, struct irq_data *irqd; int i, err, node; - if (disable_apic) + if (apic_is_disabled) return -ENXIO; /* @@ -680,7 +691,7 @@ static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, * if IRQ remapping is enabled. APIC IDs above 15 bits are * only permitted if IRQ remapping is enabled, so check that. */ - if (apic->apic_id_valid(32768)) + if (apic_id_valid(32768)) return 0; return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); @@ -841,10 +852,21 @@ void lapic_online(void) this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); } +static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr); + void lapic_offline(void) { + struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup); + lock_vector_lock(); + + /* In case the vector cleanup timer has not expired */ + __vector_cleanup(cl, false); + irq_matrix_offline(vector_matrix); + WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(!hlist_empty(&cl->head)); + unlock_vector_lock(); } @@ -876,7 +898,7 @@ static int apic_retrigger_irq(struct irq_data *irqd) unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - apic->send_IPI(apicd->cpu, apicd->vector); + __apic_send_IPI(apicd->cpu, apicd->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -885,7 +907,7 @@ static int apic_retrigger_irq(struct irq_data *irqd) void apic_ack_irq(struct irq_data *irqd) { irq_move_irq(irqd); - ack_APIC_irq(); + apic_eoi(); } void apic_ack_edge(struct irq_data *irqd) @@ -934,62 +956,98 @@ static void free_moved_vector(struct apic_chip_data *apicd) apicd->move_in_progress = 0; } -DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) +static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) { - struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); struct apic_chip_data *apicd; struct hlist_node *tmp; + bool rearm = false; - ack_APIC_irq(); - /* Prevent vectors vanishing under us */ - raw_spin_lock(&vector_lock); + lockdep_assert_held(&vector_lock); - hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { + hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { unsigned int irr, vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned - * up is registered at the APICs IRR. If so, then this is - * not the best time to clean it up. Clean it up in the - * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR - * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest - * priority external vector, so on return from this - * interrupt the device interrupt will happen first. + * up is registered at the APICs IRR. That's clearly a + * hardware issue if the vector arrived on the old target + * _after_ interrupts were disabled above. Keep @apicd + * on the list and schedule the timer again to give the CPU + * a chance to handle the pending interrupt. + * + * Do not check IRR when called from lapic_offline(), because + * fixup_irqs() was just called to scan IRR for set bits and + * forward them to new destination CPUs via IPIs. */ - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; if (irr & (1U << (vector % 32))) { - apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); + rearm = true; continue; } free_moved_vector(apicd); } - raw_spin_unlock(&vector_lock); + /* + * Must happen under vector_lock to make the timer_pending() check + * in __vector_schedule_cleanup() race free against the rearm here. + */ + if (rearm) + mod_timer(&cl->timer, jiffies + 1); +} + +static void vector_cleanup_callback(struct timer_list *tmr) +{ + struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer); + + /* Prevent vectors vanishing under us */ + raw_spin_lock_irq(&vector_lock); + __vector_cleanup(cl, true); + raw_spin_unlock_irq(&vector_lock); } -static void __send_cleanup_vector(struct apic_chip_data *apicd) +static void __vector_schedule_cleanup(struct apic_chip_data *apicd) { - unsigned int cpu; + unsigned int cpu = apicd->prev_cpu; raw_spin_lock(&vector_lock); apicd->move_in_progress = 0; - cpu = apicd->prev_cpu; if (cpu_online(cpu)) { - hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); - apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); + struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu); + + hlist_add_head(&apicd->clist, &cl->head); + + /* + * The lockless timer_pending() check is safe here. If it + * returns true, then the callback will observe this new + * apic data in the hlist as everything is serialized by + * vector lock. + * + * If it returns false then the timer is either not armed + * or the other CPU executes the callback, which again + * would be blocked on vector lock. Rearming it in the + * latter case makes it fire for nothing. + * + * This is also safe against the callback rearming the timer + * because that's serialized via vector lock too. + */ + if (!timer_pending(&cl->timer)) { + cl->timer.expires = jiffies + 1; + add_timer_on(&cl->timer, cpu); + } } else { apicd->prev_vector = 0; } raw_spin_unlock(&vector_lock); } -void send_cleanup_vector(struct irq_cfg *cfg) +void vector_schedule_cleanup(struct irq_cfg *cfg) { struct apic_chip_data *apicd; apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); if (apicd->move_in_progress) - __send_cleanup_vector(apicd); + __vector_schedule_cleanup(apicd); } void irq_complete_move(struct irq_cfg *cfg) @@ -1007,7 +1065,7 @@ void irq_complete_move(struct irq_cfg *cfg) * on the same CPU. */ if (apicd->cpu == smp_processor_id()) - __send_cleanup_vector(apicd); + __vector_schedule_cleanup(apicd); } /* @@ -1150,7 +1208,7 @@ static void __init print_local_APIC(void *dummy) u64 icr; pr_debug("printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); + smp_processor_id(), read_apic_id()); v = apic_read(APIC_ID); pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id()); v = apic_read(APIC_LVR); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b2b2b7f3e03f..affbff65e497 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -83,16 +83,6 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } -static void x2apic_send_IPI_allbutself(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); -} - -static void x2apic_send_IPI_all(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); -} - static u32 x2apic_calc_apicid(unsigned int cpu) { return x86_cpu_to_logical_apicid[cpu]; @@ -236,8 +226,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = x2apic_apic_id_valid, - .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, @@ -247,12 +235,11 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = x2apic_phys_pkg_id, + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -265,15 +252,11 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .inquire_remote_apic = NULL, - .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; apic_driver(apic_x2apic_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 896bc41cb2ba..788cdb4ee394 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -8,11 +8,13 @@ int x2apic_phys; static struct apic apic_x2apic_phys; -static u32 x2apic_max_apicid __ro_after_init; +u32 x2apic_max_apicid __ro_after_init = UINT_MAX; void __init x2apic_set_max_apicid(u32 apicid) { x2apic_max_apicid = apicid; + if (apic->x2apic_set_max_apicid) + apic->max_apic_id = apicid; } static int __init set_x2apic_phys_mode(char *arg) @@ -81,43 +83,28 @@ static void __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } -static void x2apic_send_IPI_allbutself(int vector) +static void __x2apic_send_IPI_shorthand(int vector, u32 which) { - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); -} + unsigned long cfg = __prepare_ICR(which, vector, 0); -static void x2apic_send_IPI_all(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + native_x2apic_icr_write(cfg, 0); } -static void init_x2apic_ldr(void) +void x2apic_send_IPI_allbutself(int vector) { + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); } -static int x2apic_phys_probe(void) -{ - if (!x2apic_mode) - return 0; - - if (x2apic_phys || x2apic_fadt_phys()) - return 1; - - return apic == &apic_x2apic_phys; -} - -/* Common x2apic functions, also used by x2apic_cluster */ -int x2apic_apic_id_valid(u32 apicid) +void x2apic_send_IPI_all(int vector) { - if (x2apic_max_apicid && apicid > x2apic_max_apicid) - return 0; - - return 1; + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); } -int x2apic_apic_id_registered(void) +void x2apic_send_IPI_self(int vector) { - return 1; + apic_write(APIC_SELF_IPI, vector); } void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) @@ -126,13 +113,15 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) native_x2apic_icr_write(cfg, apicid); } -void __x2apic_send_IPI_shorthand(int vector, u32 which) +static int x2apic_phys_probe(void) { - unsigned long cfg = __prepare_ICR(which, vector, 0); + if (!x2apic_mode) + return 0; - /* x2apic MSRs are special and need a special fence: */ - weak_wrmsr_fence(); - native_x2apic_icr_write(cfg, 0); + if (x2apic_phys || x2apic_fadt_phys()) + return 1; + + return apic == &apic_x2apic_phys; } unsigned int x2apic_get_apic_id(unsigned long id) @@ -150,33 +139,22 @@ int x2apic_phys_pkg_id(int initial_apicid, int index_msb) return initial_apicid >> index_msb; } -void x2apic_send_IPI_self(int vector) -{ - apic_write(APIC_SELF_IPI, vector); -} - static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = x2apic_apic_id_valid, - .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = x2apic_phys_pkg_id, + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -189,15 +167,11 @@ static struct apic apic_x2apic_phys __ro_after_init = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .inquire_remote_apic = NULL, - .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d9384d5b4b8e..d9f5d7492f83 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -25,6 +25,8 @@ #include <asm/uv/uv.h> #include <asm/apic.h> +#include "local.h" + static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; @@ -294,8 +296,7 @@ static void __init early_get_apic_socketid_shift(void) static void __init uv_stringify(int len, char *to, char *from) { - /* Relies on 'to' being NULL chars so result will be NULL terminated */ - strncpy(to, from, len-1); + strscpy(to, from, len); /* Trim trailing spaces */ (void)strim(to); @@ -778,30 +779,6 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static int uv_apic_id_valid(u32 apicid) -{ - return 1; -} - -static int uv_apic_id_registered(void) -{ - return 1; -} - -static void uv_init_apic_ldr(void) -{ -} - -static u32 apic_uv_calc_apicid(unsigned int cpu) -{ - return apic_default_calc_apicid(cpu); -} - -static unsigned int x2apic_get_apic_id(unsigned long id) -{ - return id; -} - static u32 set_apic_id(unsigned int id) { return id; @@ -817,11 +794,6 @@ static int uv_phys_pkg_id(int initial_apicid, int index_msb) return uv_read_apic_id() >> index_msb; } -static void uv_send_IPI_self(int vector) -{ - apic_write(APIC_SELF_IPI, vector); -} - static int uv_probe(void) { return apic == &apic_x2apic_uv_x; @@ -832,45 +804,35 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .apic_id_valid = uv_apic_id_valid, - .apic_id_registered = uv_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = uv_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = uv_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .calc_dest_apicid = apic_uv_calc_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_all = uv_send_IPI_all, - .send_IPI_self = uv_send_IPI_self, + .send_IPI_self = x2apic_send_IPI_self, .wakeup_secondary_cpu = uv_wakeup_secondary, - .inquire_remote_apic = NULL, .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 @@ -1013,7 +975,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, /* One (UV2) mapping */ if (index == UV2_MMIOH) { - strncpy(id, "MMIOH", sizeof(id)); + strscpy(id, "MMIOH", sizeof(id)); max_io = max_pnode; mapped = 0; goto map_exit; @@ -1845,7 +1807,7 @@ static void __init uv_system_init_hub(void) /* Initialize per CPU info: */ for_each_possible_cpu(cpu) { - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + int apicid = per_cpu(x86_cpu_to_apicid, cpu); unsigned short bid; unsigned short pnode; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c6c15ce1952f..5934ee5bc087 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -239,12 +239,6 @@ extern int (*console_blank_hook)(int); #endif /* - * The apm_bios device is one of the misc char devices. - * This is its minor number. - */ -#define APM_MINOR_DEV 134 - -/* * Various options can be changed at boot time as follows: * (We allow underscores for compatibility with the modules code) * apm=on/off enable/disable APM diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 44c3601cfdc4..190c120f4285 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -63,11 +63,6 @@ int audit_classify_syscall(int abi, unsigned syscall) static int __init audit_classes_init(void) { #ifdef CONFIG_IA32_EMULATION - extern __u32 ia32_dir_class[]; - extern __u32 ia32_write_class[]; - extern __u32 ia32_read_class[]; - extern __u32 ia32_chattr_class[]; - extern __u32 ia32_signal_class[]; audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c new file mode 100644 index 000000000000..d2c732a34e5d --- /dev/null +++ b/arch/x86/kernel/cet.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/ptrace.h> +#include <asm/bugs.h> +#include <asm/traps.h> + +enum cp_error_code { + CP_EC = (1 << 15) - 1, + + CP_RET = 1, + CP_IRET = 2, + CP_ENDBR = 3, + CP_RSTRORSSP = 4, + CP_SETSSBSY = 5, + + CP_ENCL = 1 << 15, +}; + +static const char cp_err[][10] = { + [0] = "unknown", + [1] = "near ret", + [2] = "far/iret", + [3] = "endbranch", + [4] = "rstorssp", + [5] = "setssbsy", +}; + +static const char *cp_err_string(unsigned long error_code) +{ + unsigned int cpec = error_code & CP_EC; + + if (cpec >= ARRAY_SIZE(cp_err)) + cpec = 0; + return cp_err[cpec]; +} + +static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code) +{ + WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n", + user_mode(regs) ? "user mode" : "kernel mode", + cp_err_string(error_code)); +} + +static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + unsigned long ssp; + + /* + * An exception was just taken from userspace. Since interrupts are disabled + * here, no scheduling should have messed with the registers yet and they + * will be whatever is live in userspace. So read the SSP before enabling + * interrupts so locking the fpregs to do it later is not required. + */ + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + cond_local_irq_enable(regs); + + tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_CP; + + /* Ratelimit to prevent log spamming. */ + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + __ratelimit(&cpf_rate)) { + pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, ssp, error_code, + cp_err_string(error_code), + error_code & CP_ENCL ? " in enclave" : ""); + print_vma_addr(KERN_CONT " in ", regs->ip); + pr_cont("\n"); + } + + force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0); + cond_local_irq_disable(regs); +} + +static __ro_after_init bool ibt_fatal = true; + +static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + if ((error_code & CP_EC) != CP_ENDBR) { + do_unexpected_cp(regs, error_code); + return; + } + + if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { + regs->ax = 0; + return; + } + + pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); + if (!ibt_fatal) { + printk(KERN_DEFAULT CUT_HERE); + __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + return; + } + BUG(); +} + +static int __init ibt_setup(char *str) +{ + if (!strcmp(str, "off")) + setup_clear_cpu_cap(X86_FEATURE_IBT); + + if (!strcmp(str, "warn")) + ibt_fatal = false; + + return 1; +} + +__setup("ibt=", ibt_setup); + +DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +{ + if (user_mode(regs)) { + if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + do_user_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } else { + if (cpu_feature_enabled(X86_FEATURE_IBT)) + do_kernel_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } +} diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 485441b7f030..bfeb18fad63f 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -51,7 +51,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) * will block the interrupt whose vector is lower than * HYPERVISOR_CALLBACK_VECTOR. */ - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(irq_hv_callback_count); if (acrn_intr_handler) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7eca6a8abbb1..dd8379d84445 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1047,7 +1047,7 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_FSRS); /* get apicid instead of initial apic id from cpuid */ - c->apicid = hard_smp_processor_id(); + c->apicid = read_apic_id(); /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e3a65e9fc750..382d4e6b848d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,7 +59,6 @@ #include <asm/cacheinfo.h> #include <asm/memtype.h> #include <asm/microcode.h> -#include <asm/microcode_intel.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> @@ -588,27 +587,43 @@ __noendbr void ibt_restore(u64 save) static __always_inline void setup_cet(struct cpuinfo_x86 *c) { - u64 msr = CET_ENDBR_EN; + bool user_shstk, kernel_ibt; - if (!HAS_KERNEL_IBT || - !cpu_feature_enabled(X86_FEATURE_IBT)) + if (!IS_ENABLED(CONFIG_X86_CET)) return; - wrmsrl(MSR_IA32_S_CET, msr); + kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT); + user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) && + IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK); + + if (!kernel_ibt && !user_shstk) + return; + + if (user_shstk) + set_cpu_cap(c, X86_FEATURE_USER_SHSTK); + + if (kernel_ibt) + wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + else + wrmsrl(MSR_IA32_S_CET, 0); + cr4_set_bits(X86_CR4_CET); - if (!ibt_selftest()) { + if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); - return; } } __noendbr void cet_disable(void) { - if (cpu_feature_enabled(X86_FEATURE_IBT)) - wrmsrl(MSR_IA32_S_CET, 0); + if (!(cpu_feature_enabled(X86_FEATURE_IBT) || + cpu_feature_enabled(X86_FEATURE_SHSTK))) + return; + + wrmsrl(MSR_IA32_S_CET, 0); + wrmsrl(MSR_IA32_U_CET, 0); } /* @@ -1265,11 +1280,11 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), @@ -1492,6 +1507,9 @@ static void __init cpu_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + if (cmdline_find_option_bool(boot_command_line, "nousershstk")) + setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); if (arglen <= 0) return; @@ -1959,7 +1977,7 @@ void enable_sep_cpu(void) } #endif -void __init identify_boot_cpu(void) +static __init void identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) @@ -2300,8 +2318,7 @@ void store_cpu_caps(struct cpuinfo_x86 *curr_info) * @prev_info: CPU capabilities stored before an update. * * The microcode loader calls this upon late microcode load to recheck features, - * only when microcode has been updated. Caller holds microcode_mutex and CPU - * hotplug lock. + * only when microcode has been updated. Caller holds and CPU hotplug lock. * * Return: None */ @@ -2343,7 +2360,7 @@ void __init arch_cpu_finalize_init(void) * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_check_topology(); + cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index f6748c8bd647..e462c1d3800a 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -81,6 +81,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, {} }; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 5a2962c492d3..defdc594be14 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -8,6 +8,7 @@ */ #include <linux/io.h> +#include <asm/apic.h> #include <asm/cpu.h> #include <asm/smp.h> #include <asm/numa.h> @@ -300,7 +301,7 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ - c->apicid = hard_smp_processor_id(); + c->apicid = read_apic_id(); /* * XXX someone from Hygon needs to confirm this DTRT diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1c4639588ff9..be4045628fd3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -20,7 +20,7 @@ #include <asm/bugs.h> #include <asm/cpu.h> #include <asm/intel-family.h> -#include <asm/microcode_intel.h> +#include <asm/microcode.h> #include <asm/hwcap2.h> #include <asm/elf.h> #include <asm/cpu_device_id.h> @@ -184,180 +184,6 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } -int intel_cpu_collect_info(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if (model >= 5 || family > 6) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - - return 0; -} -EXPORT_SYMBOL_GPL(intel_cpu_collect_info); - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int intel_find_matching_signature(void *mc, unsigned int csig, int cpf) -{ - struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; - struct extended_signature *ext_sig; - int i; - - if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; - - /* Look for ext. headers: */ - if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; - - ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; - ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - - for (i = 0; i < ext_hdr->count; i++) { - if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} -EXPORT_SYMBOL_GPL(intel_find_matching_signature); - -/** - * intel_microcode_sanity_check() - Sanity check microcode file. - * @mc: Pointer to the microcode file contents. - * @print_err: Display failure reason if true, silent if false. - * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file. - * Validate if the microcode header type matches with the type - * specified here. - * - * Validate certain header fields and verify if computed checksum matches - * with the one specified in the header. - * - * Return: 0 if the file passes all the checks, -EINVAL if any of the checks - * fail. - */ -int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - u32 sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("Error: bad microcode data file size.\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) { - if (print_err) - pr_err("Error: invalid/unknown microcode update format. Header type %d\n", - mc_header->hdrver); - return -EINVAL; - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep; - - if (ext_table_size < EXT_HEADER_SIZE || - ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("Error: truncated extended signature table.\n"); - return -EINVAL; - } - - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("Error: extended signature table size mismatch.\n"); - return -EFAULT; - } - - ext_sigcount = ext_header->count; - - /* - * Check extended table checksum: the sum of all dwords that - * comprise a valid table must be 0. - */ - ext_tablep = (u32 *)ext_header; - - i = ext_table_size / sizeof(u32); - while (i--) - ext_table_sum += ext_tablep[i]; - - if (ext_table_sum) { - if (print_err) - pr_warn("Bad extended signature table checksum, aborting.\n"); - return -EINVAL; - } - } - - /* - * Calculate the checksum of update data and header. The checksum of - * valid update data and header including the extended signature table - * must be 0. - */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / sizeof(u32); - while (i--) - orig_sum += ((u32 *)mc)[i]; - - if (orig_sum) { - if (print_err) - pr_err("Bad microcode data checksum, aborting.\n"); - return -EINVAL; - } - - if (!ext_table_size) - return 0; - - /* - * Check extended signature checksum: 0 => valid. - */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - - sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - - (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("Bad extended signature checksum, aborting.\n"); - return -EINVAL; - } - } - return 0; -} -EXPORT_SYMBOL_GPL(intel_microcode_sanity_check); - static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 3b8476158236..e4c3ba91321c 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -206,7 +206,7 @@ static int intel_epb_offline(unsigned int cpu) static const struct x86_cpu_id intel_epb_normal[] = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, ENERGY_PERF_BIAS_NORMAL_POWERSAVE), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, ENERGY_PERF_BIAS_NORMAL_POWERSAVE), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, ENERGY_PERF_BIAS_NORMAL_POWERSAVE), diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c4ec4ca47e11..c267f43de39e 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -759,7 +759,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - ack_APIC_irq(); + apic_eoi(); } /* diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 89e2aab5d34d..6f35f724cc14 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -843,6 +843,26 @@ static noinstr bool quirk_skylake_repmov(void) } /* + * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption + * errors. This means mce_gather_info() will not save the "ip" and "cs" registers. + * + * However, the context is still valid, so save the "cs" register for later use. + * + * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV. + * + * The Instruction Fetch Unit is at MCA bank 1 for all affected systems. + */ +static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 1) + return; + if (!(m->status & MCI_STATUS_POISON)) + return; + + m->cs = regs->cs; +} + +/* * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ @@ -861,6 +881,9 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo if (mce_flags.snb_ifu_quirk) quirk_sandybridge_ifu(i, m, regs); + if (mce_flags.zen_ifu_quirk) + quirk_zen_ifu(i, m, regs); + m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); @@ -1608,6 +1631,13 @@ static void __start_timer(struct timer_list *t, unsigned long interval) local_irq_restore(flags); } +static void mc_poll_banks_default(void) +{ + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); +} + +void (*mc_poll_banks)(void) = mc_poll_banks_default; + static void mce_timer_fn(struct timer_list *t) { struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); @@ -1618,7 +1648,7 @@ static void mce_timer_fn(struct timer_list *t) iv = __this_cpu_read(mce_next_interval); if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); + mc_poll_banks(); if (mce_intel_cmci_poll()) { iv = mce_adjust_timer(iv); @@ -1842,6 +1872,9 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 0x15 && c->x86_model <= 0xf) mce_flags.overflow_recov = 1; + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; + } if (c->x86_vendor == X86_VENDOR_INTEL) { diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 12cf2e7ca33c..4d8d4bcf915d 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -270,8 +270,7 @@ static void __maybe_unused raise_mce(struct mce *m) mce_irq_ipi, NULL, 0); preempt_enable(); } else if (m->inject_flags & MCJ_NMI_BROADCAST) - apic->send_IPI_mask(mce_inject_cpumask, - NMI_VECTOR); + __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 95275a5e57e0..f5323551c1a9 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -56,6 +56,13 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt); */ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +/* + * On systems that do support CMCI but it's disabled, polling for MCEs can + * cause the same event to be reported multiple times because IA32_MCi_STATUS + * is shared by the same package. + */ +static DEFINE_SPINLOCK(cmci_poll_lock); + #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) #define CMCI_STORM_INTERVAL (HZ) @@ -426,12 +433,22 @@ void cmci_disable_bank(int bank) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } +/* Bank polling function when CMCI is disabled. */ +static void cmci_mc_poll_banks(void) +{ + spin_lock(&cmci_poll_lock); + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); + spin_unlock(&cmci_poll_lock); +} + void intel_init_cmci(void) { int banks; - if (!cmci_supported(&banks)) + if (!cmci_supported(&banks)) { + mc_poll_banks = cmci_mc_poll_banks; return; + } mce_threshold_vector = intel_threshold_interrupt; cmci_discover(banks); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index d2412ce2d312..bcf1b3c66c9c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -157,6 +157,9 @@ struct mce_vendor_flags { */ smca : 1, + /* Zen IFU quirk */ + zen_ifu_quirk : 1, + /* AMD-style error thresholding banks present. */ amd_threshold : 1, @@ -172,7 +175,7 @@ struct mce_vendor_flags { /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */ skx_repmov_quirk : 1, - __reserved_0 : 56; + __reserved_0 : 55; }; extern struct mce_vendor_flags mce_flags; @@ -274,4 +277,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) return 0; } +extern void (*mc_poll_banks)(void); #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 6a059a035021..ef4e7bb5fd88 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -27,5 +27,5 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) inc_irq_stat(irq_threshold_count); mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); - ack_APIC_irq(); + apic_eoi(); } diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile index 34098d48c48f..193d98b33a0a 100644 --- a/arch/x86/kernel/cpu/microcode/Makefile +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o -microcode-$(CONFIG_MICROCODE_INTEL) += intel.o -microcode-$(CONFIG_MICROCODE_AMD) += amd.o +microcode-$(CONFIG_CPU_SUP_INTEL) += intel.o +microcode-$(CONFIG_CPU_SUP_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 87208e46f7ed..bbd1dc38ea03 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -29,13 +29,53 @@ #include <linux/kernel.h> #include <linux/pci.h> -#include <asm/microcode_amd.h> #include <asm/microcode.h> #include <asm/processor.h> #include <asm/setup.h> #include <asm/cpu.h> #include <asm/msr.h> +#include "internal.h" + +#define UCODE_MAGIC 0x00414d44 +#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 +#define UCODE_UCODE_TYPE 0x00000001 + +#define SECTION_HDR_SIZE 8 +#define CONTAINER_HDR_SZ 12 + +struct equiv_cpu_entry { + u32 installed_cpu; + u32 fixed_errata_mask; + u32 fixed_errata_compare; + u16 equiv_cpu; + u16 res; +} __packed; + +struct microcode_header_amd { + u32 data_code; + u32 patch_id; + u16 mc_patch_data_id; + u8 mc_patch_data_len; + u8 init_flag; + u32 mc_patch_data_checksum; + u32 nb_dev_id; + u32 sb_dev_id; + u16 processor_rev_id; + u8 nb_rev_id; + u8 sb_rev_id; + u8 bios_api_rev; + u8 reserved1[3]; + u32 match_reg[8]; +} __packed; + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[]; +}; + +#define PATCH_MAX_SIZE (3 * PAGE_SIZE) + static struct equiv_cpu_table { unsigned int num_entries; struct equiv_cpu_entry *entry; @@ -56,9 +96,6 @@ struct cont_desc { static u32 ucode_new_rev; -/* One blob per node. */ -static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; - /* * Microcode patch container file is prepended to the initrd in cpio * format. See Documentation/arch/x86/microcode.rst @@ -415,20 +452,17 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) +static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) { struct cont_desc desc = { 0 }; - u8 (*patch)[PATCH_MAX_SIZE]; struct microcode_amd *mc; u32 rev, dummy, *new_rev; bool ret = false; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; - patch = &amd_ucode_patch[0]; #endif desc.cpuid_1_eax = cpuid_1_eax; @@ -452,9 +486,6 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, boo if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; ret = true; - - if (save_patch) - memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); } return ret; @@ -507,7 +538,7 @@ static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret = cp; } -void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) +static void apply_ucode_from_containers(unsigned int cpuid_1_eax) { struct cpio_data cp = { }; @@ -515,42 +546,12 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) if (!(cp.data && cp.size)) return; - early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size); } -void load_ucode_amd_ap(unsigned int cpuid_1_eax) +void load_ucode_amd_early(unsigned int cpuid_1_eax) { - struct microcode_amd *mc; - struct cpio_data cp; - u32 *new_rev, rev, dummy; - - if (IS_ENABLED(CONFIG_X86_32)) { - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - } else { - mc = (struct microcode_amd *)amd_ucode_patch; - new_rev = &ucode_new_rev; - } - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - /* - * Check whether a new patch has been saved already. Also, allow application of - * the same revision in order to pick up SMT-thread-specific configuration even - * if the sibling SMT thread already has an up-to-date revision. - */ - if (*new_rev && rev <= mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - *new_rev = mc->hdr.patch_id; - return; - } - } - - find_blobs_in_containers(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) - return; - - early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false); + return apply_ucode_from_containers(cpuid_1_eax); } static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); @@ -578,23 +579,6 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) return 0; } -void reload_ucode_amd(unsigned int cpu) -{ - u32 rev, dummy __always_unused; - struct microcode_amd *mc; - - mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("reload patch_level=0x%08x\n", ucode_new_rev); - } - } -} - /* * a small, trivial cache of per-family ucode patches */ @@ -655,6 +639,28 @@ static struct ucode_patch *find_patch(unsigned int cpu) return cache_find_patch(equiv_id); } +void reload_ucode_amd(unsigned int cpu) +{ + u32 rev, dummy __always_unused; + struct microcode_amd *mc; + struct ucode_patch *p; + + p = find_patch(cpu); + if (!p) + return; + + mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + if (rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); + } + } +} + static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -875,9 +881,6 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz continue; ret = UCODE_NEW; - - memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE); - memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); } return ret; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 3afcf3de0dd4..6cc7a2c181da 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -31,15 +31,14 @@ #include <linux/fs.h> #include <linux/mm.h> -#include <asm/microcode_intel.h> #include <asm/cpu_device_id.h> -#include <asm/microcode_amd.h> #include <asm/perf_event.h> -#include <asm/microcode.h> #include <asm/processor.h> #include <asm/cmdline.h> #include <asm/setup.h> +#include "internal.h" + #define DRIVER_VERSION "2.2" static struct microcode_ops *microcode_ops; @@ -54,15 +53,12 @@ LIST_HEAD(microcode_cache); * * All non cpu-hotplug-callback call sites use: * - * - microcode_mutex to synchronize with each other; * - cpus_read_lock/unlock() to synchronize with * the cpu-hotplug-callback call sites. * * We guarantee that only a single cpu is being * updated at any particular moment of time. */ -static DEFINE_MUTEX(microcode_mutex); - struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; struct cpu_info_ctx { @@ -172,7 +168,7 @@ void __init load_ucode_bsp(void) if (intel) load_ucode_intel_bsp(); else - load_ucode_amd_bsp(cpuid_1_eax); + load_ucode_amd_early(cpuid_1_eax); } static bool check_loader_disabled_ap(void) @@ -200,7 +196,7 @@ void load_ucode_ap(void) break; case X86_VENDOR_AMD: if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_ap(cpuid_1_eax); + load_ucode_amd_early(cpuid_1_eax); break; default: break; @@ -298,7 +294,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) #endif } -void reload_early_microcode(unsigned int cpu) +static void reload_early_microcode(unsigned int cpu) { int vendor, family; @@ -488,10 +484,7 @@ static ssize_t reload_store(struct device *dev, if (tmp_ret != UCODE_NEW) goto put; - mutex_lock(µcode_mutex); ret = microcode_reload_late(); - mutex_unlock(µcode_mutex); - put: cpus_read_unlock(); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 467cf37ea90a..94dd6af9c963 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -10,15 +10,7 @@ * Copyright (C) 2012 Fenghua Yu <[email protected]> * H Peter Anvin" <[email protected]> */ - -/* - * This needs to be before all headers so that pr_debug in printk.h doesn't turn - * printk calls into no_printk(). - * - *#define DEBUG - */ #define pr_fmt(fmt) "microcode: " fmt - #include <linux/earlycpio.h> #include <linux/firmware.h> #include <linux/uaccess.h> @@ -30,13 +22,14 @@ #include <linux/uio.h> #include <linux/mm.h> -#include <asm/microcode_intel.h> #include <asm/intel-family.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/setup.h> #include <asm/msr.h> +#include "internal.h" + static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; /* Current microcode patch used in early patching on the APs. */ @@ -45,6 +38,208 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[]; +}; + +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) + +static inline unsigned int get_totalsize(struct microcode_header_intel *hdr) +{ + return hdr->datasize ? hdr->totalsize : DEFAULT_UCODE_TOTALSIZE; +} + +static inline unsigned int exttable_size(struct extended_sigtable *et) +{ + return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE; +} + +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + +/* + * Returns 1 if update has been found, 0 otherwise. + */ +int intel_find_matching_signature(void *mc, unsigned int csig, int cpf) +{ + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; + int i; + + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; + + /* Look for ext. headers: */ + if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; + + ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; + + for (i = 0; i < ext_hdr->count; i++) { + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_find_matching_signature); + +/** + * intel_microcode_sanity_check() - Sanity check microcode file. + * @mc: Pointer to the microcode file contents. + * @print_err: Display failure reason if true, silent if false. + * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file. + * Validate if the microcode header type matches with the type + * specified here. + * + * Validate certain header fields and verify if computed checksum matches + * with the one specified in the header. + * + * Return: 0 if the file passes all the checks, -EINVAL if any of the checks + * fail. + */ +int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = intel_microcode_get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format. Header type %d\n", + mc_header->hdrver); + return -EINVAL; + } + + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + + if (ext_table_size < EXT_HEADER_SIZE || + ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; + + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; + } + + if (!ext_table_size) + return 0; + + /* + * Check extended signature checksum: 0 => valid. + */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_microcode_sanity_check); + /* * Returns 1 if update has been found, 0 otherwise. */ @@ -202,86 +397,6 @@ next: return patch; } -static void show_saved_mc(void) -{ -#ifdef DEBUG - int i = 0, j; - unsigned int sig, pf, rev, total_size, data_size, date; - struct ucode_cpu_info uci; - struct ucode_patch *p; - - if (list_empty(µcode_cache)) { - pr_debug("no microcode data saved.\n"); - return; - } - - intel_cpu_collect_info(&uci); - - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; - pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - - list_for_each_entry(p, µcode_cache, plist) { - struct microcode_header_intel *mc_saved_header; - struct extended_sigtable *ext_header; - struct extended_signature *ext_sig; - int ext_sigcount; - - mc_saved_header = (struct microcode_header_intel *)p->data; - - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - date = mc_saved_header->date; - - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - - pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n", - i++, sig, pf, rev, total_size, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - continue; - - ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - - for (j = 0; j < ext_sigcount; j++) { - sig = ext_sig->sig; - pf = ext_sig->pf; - - pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", - j, sig, pf); - - ext_sig++; - } - } -#endif -} - -/* - * Save this microcode patch. It will be loaded early when a CPU is - * hot-added or resumes. - */ -static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size) -{ - /* Synchronization during CPU hotplug. */ - static DEFINE_MUTEX(x86_cpu_microcode_mutex); - - mutex_lock(&x86_cpu_microcode_mutex); - - save_microcode_patch(uci, mc, size); - show_saved_mc(); - - mutex_unlock(&x86_cpu_microcode_mutex); -} - static bool load_builtin_intel_microcode(struct cpio_data *cp) { unsigned int eax = 1, ebx, ecx = 0, edx; @@ -428,9 +543,6 @@ int __init save_microcode_in_initrd_intel(void) intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); - - show_saved_mc(); - return 0; } @@ -701,12 +813,8 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; - /* - * If early loading microcode is supported, save this mc into - * permanent memory. So it will be loaded early when a CPU is hot added - * or resumes. - */ - save_mc_for_early(uci, new_mc, new_mc_size); + /* Save for CPU hotplug */ + save_microcode_patch(uci, new_mc, new_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h new file mode 100644 index 000000000000..bf883aa71233 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_MICROCODE_INTERNAL_H +#define _X86_MICROCODE_INTERNAL_H + +#include <linux/earlycpio.h> +#include <linux/initrd.h> + +#include <asm/cpu.h> +#include <asm/microcode.h> + +struct ucode_patch { + struct list_head plist; + void *data; /* Intel uses only this one */ + unsigned int size; + u32 patch_id; + u16 equiv_cpu; +}; + +extern struct list_head microcode_cache; + +struct device; + +enum ucode_state { + UCODE_OK = 0, + UCODE_NEW, + UCODE_UPDATED, + UCODE_NFOUND, + UCODE_ERROR, +}; + +struct microcode_ops { + enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev); + + void (*microcode_fini_cpu)(int cpu); + + /* + * The generic 'microcode_core' part guarantees that + * the callbacks below run on a target cpu when they + * are being called. + * See also the "Synchronization" section in microcode_core.c. + */ + enum ucode_state (*apply_microcode)(int cpu); + int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); +}; + +extern struct ucode_cpu_info ucode_cpu_info[]; +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); + +#define MAX_UCODE_COUNT 128 + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!(((ebx) ^ (a)) | ((edx) ^ (b)) | ((ecx) ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_cpuid_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_cpuid_vendor() to get vendor id for AP. + * + * x86_cpuid_vendor() gets vendor information directly from CPUID. + */ +static inline int x86_cpuid_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +static inline unsigned int x86_cpuid_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + return x86_family(eax); +} + +extern bool initrd_gone; + +#ifdef CONFIG_CPU_SUP_AMD +void load_ucode_amd_bsp(unsigned int family); +void load_ucode_amd_ap(unsigned int family); +void load_ucode_amd_early(unsigned int cpuid_1_eax); +int save_microcode_in_initrd_amd(unsigned int family); +void reload_ucode_amd(unsigned int cpu); +struct microcode_ops *init_amd_microcode(void); +void exit_amd_microcode(void); +#else /* CONFIG_CPU_SUP_AMD */ +static inline void load_ucode_amd_bsp(unsigned int family) { } +static inline void load_ucode_amd_ap(unsigned int family) { } +static inline void load_ucode_amd_early(unsigned int family) { } +static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } +static inline void reload_ucode_amd(unsigned int cpu) { } +static inline struct microcode_ops *init_amd_microcode(void) { return NULL; } +static inline void exit_amd_microcode(void) { } +#endif /* !CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL +void load_ucode_intel_bsp(void); +void load_ucode_intel_ap(void); +int save_microcode_in_initrd_intel(void); +void reload_ucode_intel(void); +struct microcode_ops *init_intel_microcode(void); +#else /* CONFIG_CPU_SUP_INTEL */ +static inline void load_ucode_intel_bsp(void) { } +static inline void load_ucode_intel_ap(void) { } +static inline int save_microcode_in_initrd_intel(void) { return -EINVAL; } +static inline void reload_ucode_intel(void) { } +static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } +#endif /* !CONFIG_CPU_SUP_INTEL */ + +#endif /* _X86_MICROCODE_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index c7969e806c64..0100468e72ca 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -119,7 +119,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) vmbus_handler(); if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) - ack_APIC_irq(); + apic_eoi(); set_irq_regs(old_regs); } @@ -147,7 +147,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) if (hv_stimer0_handler) hv_stimer0_handler(); add_interrupt_randomness(HYPERV_STIMER0_VECTOR); - ack_APIC_irq(); + apic_eoi(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 099b6f0d96bd..31c0e68f6227 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -4,6 +4,8 @@ #include <linux/string.h> #include <linux/seq_file.h> #include <linux/cpufreq.h> +#include <asm/prctl.h> +#include <linux/proc_fs.h> #include "cpu.h" @@ -175,3 +177,24 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +static void dump_x86_features(struct seq_file *m, unsigned long features) +{ + if (features & ARCH_SHSTK_SHSTK) + seq_puts(m, "shstk "); + if (features & ARCH_SHSTK_WRSS) + seq_puts(m, "wrss "); +} + +void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) +{ + seq_puts(m, "x86_Thread_features:\t"); + dump_x86_features(m, task->thread.features); + seq_putc(m, '\n'); + + seq_puts(m, "x86_Thread_features_locked:\t"); + dump_x86_features(m, task->thread.features_locked); + seq_putc(m, '\n'); +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 458cb7419502..8f559eeae08e 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -45,7 +45,21 @@ static u64 prefetch_disable_bits; */ static unsigned int pseudo_lock_major; static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); -static struct class *pseudo_lock_class; + +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) +{ + const struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); +} + +static const struct class pseudo_lock_class = { + .name = "pseudo_lock", + .devnode = pseudo_lock_devnode, +}; /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms @@ -1353,7 +1367,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) &pseudo_measure_fops); } - dev = device_create(pseudo_lock_class, NULL, + dev = device_create(&pseudo_lock_class, NULL, MKDEV(pseudo_lock_major, new_minor), rdtgrp, "%s", rdtgrp->kn->name); @@ -1383,7 +1397,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) goto out; out_device: - device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); out_debugfs: debugfs_remove_recursive(plr->debugfs_dir); pseudo_lock_minor_release(new_minor); @@ -1424,7 +1438,7 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) pseudo_lock_cstates_relax(plr); debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); pseudo_lock_minor_release(plr->minor); free: @@ -1560,16 +1574,6 @@ static const struct file_operations pseudo_lock_dev_fops = { .mmap = pseudo_lock_dev_mmap, }; -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); -} - int rdt_pseudo_lock_init(void) { int ret; @@ -1580,21 +1584,18 @@ int rdt_pseudo_lock_init(void) pseudo_lock_major = ret; - pseudo_lock_class = class_create("pseudo_lock"); - if (IS_ERR(pseudo_lock_class)) { - ret = PTR_ERR(pseudo_lock_class); + ret = class_register(&pseudo_lock_class); + if (ret) { unregister_chrdev(pseudo_lock_major, "pseudo_lock"); return ret; } - pseudo_lock_class->devnode = pseudo_lock_devnode; return 0; } void rdt_pseudo_lock_release(void) { - class_destroy(pseudo_lock_class); - pseudo_lock_class = NULL; + class_unregister(&pseudo_lock_class); unregister_chrdev(pseudo_lock_major, "pseudo_lock"); pseudo_lock_major = 0; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index bdc0d5539b57..dae436253de4 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,7 +40,6 @@ #include <asm/processor.h> #include <asm/msr.h> -static struct class *cpuid_class; static enum cpuhp_state cpuhp_cpuid_state; struct cpuid_regs_done { @@ -124,26 +123,31 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; +static char *cpuid_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + +static const struct class cpuid_class = { + .name = "cpuid", + .devnode = cpuid_devnode, +}; + static int cpuid_device_create(unsigned int cpu) { struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + dev = device_create(&cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); return PTR_ERR_OR_ZERO(dev); } static int cpuid_device_destroy(unsigned int cpu) { - device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + device_destroy(&cpuid_class, MKDEV(CPUID_MAJOR, cpu)); return 0; } -static char *cpuid_devnode(const struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); -} - static int __init cpuid_init(void) { int err; @@ -154,12 +158,9 @@ static int __init cpuid_init(void) CPUID_MAJOR); return -EBUSY; } - cpuid_class = class_create("cpuid"); - if (IS_ERR(cpuid_class)) { - err = PTR_ERR(cpuid_class); + err = class_register(&cpuid_class); + if (err) goto out_chrdev; - } - cpuid_class->devnode = cpuid_devnode; err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", cpuid_device_create, cpuid_device_destroy); @@ -170,7 +171,7 @@ static int __init cpuid_init(void) return 0; out_class: - class_destroy(cpuid_class); + class_unregister(&cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); return err; @@ -180,7 +181,7 @@ module_init(cpuid_init); static void __exit cpuid_exit(void) { cpuhp_remove_state(cpuhp_cpuid_state); - class_destroy(cpuid_class); + class_unregister(&cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); } module_exit(cpuid_exit); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index cdd92ab43cda..587c7743fd21 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -158,8 +158,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_save_cpu(regs, safe_smp_processor_id()); } -#ifdef CONFIG_KEXEC_FILE - +#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -231,7 +230,7 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) /* Prepare elf headers. Return addr and size */ static int prepare_elf_headers(struct kimage *image, void **addr, - unsigned long *sz) + unsigned long *sz, unsigned long *nr_mem_ranges) { struct crash_mem *cmem; int ret; @@ -249,6 +248,9 @@ static int prepare_elf_headers(struct kimage *image, void **addr, if (ret) goto out; + /* Return the computed number of memory ranges, for hotplug usage */ + *nr_mem_ranges = cmem->nr_ranges; + /* By default prepare 64bit headers */ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); @@ -256,7 +258,9 @@ out: vfree(cmem); return ret; } +#endif +#ifdef CONFIG_KEXEC_FILE static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) { unsigned int nr_e820_entries; @@ -371,18 +375,42 @@ out: int crash_load_segments(struct kimage *image) { int ret; + unsigned long pnum = 0; struct kexec_buf kbuf = { .image = image, .buf_min = 0, .buf_max = ULONG_MAX, .top_down = false }; /* Prepare elf headers and add a segment */ - ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum); if (ret) return ret; - image->elf_headers = kbuf.buffer; - image->elf_headers_sz = kbuf.bufsz; + image->elf_headers = kbuf.buffer; + image->elf_headers_sz = kbuf.bufsz; + kbuf.memsz = kbuf.bufsz; + +#ifdef CONFIG_CRASH_HOTPLUG + /* + * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map, + * maximum CPUs and maximum memory ranges. + */ + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES; + else + pnum += 2 + CONFIG_NR_CPUS_DEFAULT; + + if (pnum < (unsigned long)PN_XNUM) { + kbuf.memsz = pnum * sizeof(Elf64_Phdr); + kbuf.memsz += sizeof(Elf64_Ehdr); + + image->elfcorehdr_index = image->nr_segments; + + /* Mark as usable to crash kernel, else crash kernel fails on boot */ + image->elf_headers_sz = kbuf.memsz; + } else { + pr_err("number of Phdrs %lu exceeds max\n", pnum); + } +#endif - kbuf.memsz = kbuf.bufsz; kbuf.buf_align = ELF_CORE_HEADER_ALIGN; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); @@ -395,3 +423,103 @@ int crash_load_segments(struct kimage *image) return ret; } #endif /* CONFIG_KEXEC_FILE */ + +#ifdef CONFIG_CRASH_HOTPLUG + +#undef pr_fmt +#define pr_fmt(fmt) "crash hp: " fmt + +/* These functions provide the value for the sysfs crash_hotplug nodes */ +#ifdef CONFIG_HOTPLUG_CPU +int arch_crash_hotplug_cpu_support(void) +{ + return crash_check_update_elfcorehdr(); +} +#endif + +#ifdef CONFIG_MEMORY_HOTPLUG +int arch_crash_hotplug_memory_support(void) +{ + return crash_check_update_elfcorehdr(); +} +#endif + +unsigned int arch_crash_get_elfcorehdr_size(void) +{ + unsigned int sz; + + /* kernel_map, VMCOREINFO and maximum CPUs */ + sz = 2 + CONFIG_NR_CPUS_DEFAULT; + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + sz += CONFIG_CRASH_MAX_MEMORY_RANGES; + sz *= sizeof(Elf64_Phdr); + return sz; +} + +/** + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes + * @image: a pointer to kexec_crash_image + * + * Prepare the new elfcorehdr and replace the existing elfcorehdr. + */ +void arch_crash_handle_hotplug_event(struct kimage *image) +{ + void *elfbuf = NULL, *old_elfcorehdr; + unsigned long nr_mem_ranges; + unsigned long mem, memsz; + unsigned long elfsz = 0; + + /* + * As crash_prepare_elf64_headers() has already described all + * possible CPUs, there is no need to update the elfcorehdr + * for additional CPU changes. + */ + if ((image->file_mode || image->elfcorehdr_updated) && + ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) || + (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU))) + return; + + /* + * Create the new elfcorehdr reflecting the changes to CPU and/or + * memory resources. + */ + if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) { + pr_err("unable to create new elfcorehdr"); + goto out; + } + + /* + * Obtain address and size of the elfcorehdr segment, and + * check it against the new elfcorehdr buffer. + */ + mem = image->segment[image->elfcorehdr_index].mem; + memsz = image->segment[image->elfcorehdr_index].memsz; + if (elfsz > memsz) { + pr_err("update elfcorehdr elfsz %lu > memsz %lu", + elfsz, memsz); + goto out; + } + + /* + * Copy new elfcorehdr over the old elfcorehdr at destination. + */ + old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT)); + if (!old_elfcorehdr) { + pr_err("mapping elfcorehdr segment failed\n"); + goto out; + } + + /* + * Temporarily invalidate the crash image while the + * elfcorehdr is updated. + */ + xchg(&kexec_crash_image, NULL); + memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz); + xchg(&kexec_crash_image, image); + kunmap_local(old_elfcorehdr); + pr_debug("updated elfcorehdr\n"); + +out: + vfree(elfbuf); +} +#endif diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 28da5dd83fc0..87d38f17ff5c 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -128,16 +128,15 @@ static void __init dtb_setup_hpet(void) static void __init dtb_cpu_setup(void) { struct device_node *dn; - u32 apic_id, version; + u32 apic_id; - version = GET_APIC_VERSION(apic_read(APIC_LVR)); for_each_of_cpu_node(dn) { apic_id = of_get_cpu_hwid(dn, 0); if (apic_id == ~0U) { pr_warn("%pOF: missing local APIC ID\n", dn); continue; } - generic_processor_info(apic_id, version); + generic_processor_info(apic_id); } } @@ -158,19 +157,15 @@ static void __init dtb_lapic_setup(void) /* Did the boot loader setup the local APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC)) { - if (apic_force_enable(lapic_addr)) + /* Try force enabling, which registers the APIC address */ + if (!apic_force_enable(lapic_addr)) return; - } - smp_found_config = 1; - if (of_property_read_bool(dn, "intel,virtual-wire-mode")) { - pr_info("Virtual Wire compatibility mode.\n"); - pic_mode = 0; } else { - pr_info("IMCR and PIC compatibility mode.\n"); - pic_mode = 1; + register_lapic_address(lapic_addr); } - - register_lapic_address(lapic_addr); + smp_found_config = 1; + pic_mode = !of_property_read_bool(dn, "intel,virtual-wire-mode"); + pr_info("%s compatibility mode.\n", pic_mode ? "IMCR and PIC" : "Virtual Wire"); } #endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 98e507cc7d34..a86d37052a64 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -552,8 +552,36 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) } } +/* A passed ssp of zero will not cause any update */ +static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) +{ +#ifdef CONFIG_X86_USER_SHADOW_STACK + struct cet_user_state *xstate; + + /* If ssp update is not needed. */ + if (!ssp) + return 0; + + xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + XFEATURE_CET_USER); + + /* + * If there is a non-zero ssp, then 'dst' must be configured with a shadow + * stack and the fpu state should be up to date since it was just copied + * from the parent in fpu_clone(). So there must be a valid non-init CET + * state location in the buffer. + */ + if (WARN_ON_ONCE(!xstate)) + return 1; + + xstate->user_ssp = (u64)ssp; +#endif + return 0; +} + /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long ssp) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -613,6 +641,12 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; + /* + * Update shadow stack pointer, in case it changed during clone. + */ + if (update_fpu_shstk(dst, ssp)) + return 1; + trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -753,6 +787,24 @@ void switch_fpu_return(void) } EXPORT_SYMBOL_GPL(switch_fpu_return); +void fpregs_lock_and_load(void) +{ + /* + * fpregs_lock() only disables preemption (mostly). So modifying state + * in an interrupt could screw up some in progress fpregs operation. + * Warn about it. + */ + WARN_ON_ONCE(!irq_fpu_usable()); + WARN_ON_ONCE(current->flags & PF_KTHREAD); + + fpregs_lock(); + + fpregs_assert_state_consistent(); + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); +} + #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 6d056b68f4ed..6bc1eb2a21bd 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,6 +8,7 @@ #include <asm/fpu/api.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/prctl.h> #include "context.h" #include "internal.h" @@ -174,6 +175,86 @@ out: return ret; } +#ifdef CONFIG_X86_USER_SHADOW_STACK +int ssp_active(struct task_struct *target, const struct user_regset *regset) +{ + if (target->thread.features & ARCH_SHSTK_SHSTK) + return regset->n; + + return 0; +} + +int ssp_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + struct cet_user_state *cetregs; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -ENODEV; + + sync_fpstate(fpu); + cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + return membuf_write(&to, (unsigned long *)&cetregs->user_ssp, + sizeof(cetregs->user_ssp)); +} + +int ssp_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave = &fpu->fpstate->regs.xsave; + struct cet_user_state *cetregs; + unsigned long user_ssp; + int r; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !ssp_active(target, regset)) + return -ENODEV; + + if (pos != 0 || count != sizeof(user_ssp)) + return -EINVAL; + + r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1); + if (r) + return r; + + /* + * Some kernel instructions (IRET, etc) can cause exceptions in the case + * of disallowed CET register values. Just prevent invalid values. + */ + if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8)) + return -EINVAL; + + fpu_force_restore(fpu); + + cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + cetregs->user_ssp = user_ssp; + return 0; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1afbc4866b10..cadf68737e6b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -39,26 +39,26 @@ */ static const char *xfeature_names[] = { - "x87 floating point registers" , - "SSE registers" , - "AVX registers" , - "MPX bounds registers" , - "MPX CSR" , - "AVX-512 opmask" , - "AVX-512 Hi256" , - "AVX-512 ZMM_Hi256" , - "Processor Trace (unused)" , + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace (unused)", "Protection Keys User registers", "PASID state", - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "AMX Tile config" , - "AMX Tile data" , - "unknown xstate feature" , + "Control-flow User registers", + "Control-flow Kernel registers (unused)", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "AMX Tile config", + "AMX Tile data", + "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { @@ -71,8 +71,9 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, - [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; @@ -276,6 +277,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_CET_USER); print_xstate_feature(XFEATURE_MASK_XTILE_CFG); print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } @@ -344,6 +346,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_XTILE) /* @@ -446,14 +449,15 @@ static void __init __xstate_dump_leaves(void) } \ } while (0) -#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ - if ((nr == nr_macro) && \ - WARN_ONCE(sz != sizeof(__struct), \ - "%s: struct is %zu bytes, cpu state %d bytes\n", \ - __stringify(nr_macro), sizeof(__struct), sz)) { \ +#define XCHECK_SZ(sz, nr, __struct) ({ \ + if (WARN_ONCE(sz != sizeof(__struct), \ + "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ + xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ -} while (0) + true; \ +}) + /** * check_xtile_data_against_struct - Check tile data state size. @@ -527,36 +531,28 @@ static bool __init check_xstate_against_struct(int nr) * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); + /* * Match each CPU state with the corresponding software * structure. */ - XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); - XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); - XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); - XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); - XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); - XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); - XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); - XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); - XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); - - /* The tile data size varies between implementations. */ - if (nr == XFEATURE_XTILE_DATA) - check_xtile_data_against_struct(sz); - - /* - * Make *SURE* to add any feature numbers in below if - * there are "holes" in the xsave state component - * numbers. - */ - if ((nr < XFEATURE_YMM) || - (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { + switch (nr) { + case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); + case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); + case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); + case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); + case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); + case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); + case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); + case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); + case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); + case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; + default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } + return true; } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index c5b9289837dc..ea6995920b7a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -51,7 +51,9 @@ SYM_CODE_START_NOALIGN(startup_64) * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %rsi holds a physical pointer to real_mode_data. + * %RSI holds the physical address of the boot_params structure + * provided by the bootloader. Preserve it in %R15 so C function calls + * will not clobber it. * * We come here either directly from a 64bit bootloader, or from * arch/x86/boot/compressed/head_64.S. @@ -62,6 +64,7 @@ SYM_CODE_START_NOALIGN(startup_64) * compiled to run at we first fixup the physical addresses in our page * tables and then reload them. */ + mov %rsi, %r15 /* Set up the stack for verify_cpu() */ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp @@ -75,9 +78,7 @@ SYM_CODE_START_NOALIGN(startup_64) shrq $32, %rdx wrmsr - pushq %rsi call startup_64_setup_env - popq %rsi /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -93,12 +94,10 @@ SYM_CODE_START_NOALIGN(startup_64) * Activate SEV/SME memory encryption if supported/enabled. This needs to * be done now, since this also includes setup of the SEV-SNP CPUID table, * which needs to be done before any CPUID instructions are executed in - * subsequent code. + * subsequent code. Pass the boot_params pointer as the first argument. */ - movq %rsi, %rdi - pushq %rsi + movq %r15, %rdi call sme_enable - popq %rsi #endif /* Sanitize CPU configuration */ @@ -111,9 +110,8 @@ SYM_CODE_START_NOALIGN(startup_64) * programmed into CR3. */ leaq _text(%rip), %rdi - pushq %rsi + movq %r15, %rsi call __startup_64 - popq %rsi /* Form the CR3 value being sure to include the CR3 modifier */ addq $(early_top_pgt - __START_KERNEL_map), %rax @@ -127,8 +125,6 @@ SYM_CODE_START(secondary_startup_64) * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. * - * %rsi holds a physical pointer to real_mode_data. - * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). * @@ -153,6 +149,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR + /* Clear %R15 which holds the boot_params pointer on the boot CPU */ + xorq %r15, %r15 + /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. @@ -199,13 +198,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * hypervisor could lie about the C-bit position to perform a ROP * attack on the guest by writing to the unencrypted stack and wait for * the next RET instruction. - * %rsi carries pointer to realmode data and is callee-clobbered. Save - * and restore it. */ - pushq %rsi movq %rax, %rdi call sev_verify_cbit - popq %rsi /* * Switch to new page-table @@ -365,9 +360,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) wrmsr /* Setup and Load IDT */ - pushq %rsi call early_setup_idt - popq %rsi /* Check if nx is implemented */ movl $0x80000001, %eax @@ -403,9 +396,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) pushq $0 popfq - /* rsi is pointer to real mode structure with interesting info. - pass it to C */ - movq %rsi, %rdi + /* Pass the boot_params pointer as first argument */ + movq %r15, %rdi .Ljump_to_C_code: /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c8eb1ac5125a..1648aa0204d9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -421,7 +421,7 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) * the IO_APIC has been initialized. */ hc->cpu = boot_cpu_data.cpu_index; - strncpy(hc->name, "hpet", sizeof(hc->name)); + strscpy(hc->name, "hpet", sizeof(hc->name)); hpet_init_clockevent(hc, 50); hc->evt.tick_resume = hpet_clkevt_legacy_resume; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 4d8aff05a509..30a55207c000 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -231,9 +231,7 @@ struct irq_chip i8259A_chip = { }; static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ +/* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */ static void restore_ELCR(char *trigger) { outb(trigger[0], PIC_ELCR1); diff --git a/arch/x86/kernel/ibt_selftest.S b/arch/x86/kernel/ibt_selftest.S new file mode 100644 index 000000000000..c43c4ed28a9c --- /dev/null +++ b/arch/x86/kernel/ibt_selftest.S @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <linux/objtool.h> +#include <asm/nospec-branch.h> + +SYM_CODE_START(ibt_selftest_noendbr) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + /* #CP handler sets %ax to 0 */ + RET +SYM_CODE_END(ibt_selftest_noendbr) + +SYM_FUNC_START(ibt_selftest) + lea ibt_selftest_noendbr(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + jmp *%rax +SYM_FUNC_END(ibt_selftest) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index a58c6bc1cd68..b786d48f5a0f 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -107,7 +107,7 @@ static const __initconst struct idt_data def_idts[] = { ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET INTG(X86_TRAP_CP, asm_exc_control_protection), #endif @@ -131,7 +131,6 @@ static const __initconst struct idt_data apic_idts[] = { INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), - INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f668d2f3d11..11761c124545 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -49,7 +49,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - ack_APIC_irq(); + apic_eoi(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -256,7 +256,7 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - ack_APIC_irq(); + apic_eoi(); if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", @@ -280,7 +280,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); + apic_eoi(); trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) @@ -310,7 +310,7 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_ipis); } @@ -319,7 +319,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) */ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_wakeup_ipis); kvm_posted_intr_wakeup_handler(); } @@ -329,7 +329,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_nested_ipis); } #endif @@ -401,6 +401,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) inc_irq_stat(irq_thermal_count); smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - ack_APIC_irq(); + apic_eoi(); } #endif diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 890d4778cd35..b0a24deab4a1 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -16,7 +16,7 @@ #ifdef CONFIG_X86_LOCAL_APIC DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) { - ack_APIC_irq(); + apic_eoi(); trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); @@ -28,7 +28,7 @@ void arch_irq_work_raise(void) if (!arch_irq_work_has_interrupt()) return; - apic->send_IPI_self(IRQ_WORK_VECTOR); + __apic_send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); } #endif diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 4eb8f2d19a87..578d16fc040f 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -101,10 +101,8 @@ static void __init jailhouse_get_smp_config(unsigned int early) register_lapic_address(0xfee00000); - for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) { - generic_processor_info(setup_data.v1.cpu_ids[cpu], - boot_cpu_apic_version); - } + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) + generic_processor_info(setup_data.v1.cpu_ids[cpu]); smp_found_config = 1; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f7f6042eb7e6..e8babebad7b8 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,6 +45,7 @@ #include <linux/vmalloc.h> #include <linux/pgtable.h> #include <linux/set_memory.h> +#include <linux/cfi.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -293,7 +294,40 @@ static int can_probe(unsigned long paddr) #endif addr += insn.length; } + if (IS_ENABLED(CONFIG_CFI_CLANG)) { + /* + * The compiler generates the following instruction sequence + * for indirect call checks and cfi.c decodes this; + * + *Â movl -<id>, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * Also, these movl and addl are used for showing expected + * type. So those must not be touched. + */ + __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; + + if (insn_decode_kernel(&insn, (void *)__addr) < 0) + return 0; + + if (insn.opcode.value == 0xBA) + offset = 12; + else if (insn.opcode.value == 0x3) + offset = 6; + else + goto out; + + /* This movl/addl is used for decoding CFI. */ + if (is_cfi_trap(addr + offset)) + return 0; + } +out: return (addr == paddr); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1cceac5984da..b8ab9ee5896c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -291,7 +291,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) struct pt_regs *old_regs = set_irq_regs(regs); u32 token; - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(irq_hv_callback_count); @@ -332,7 +332,7 @@ static void kvm_register_steal_time(void) static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; -static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) +static notrace __maybe_unused void kvm_guest_apic_eoi_write(void) { /** * This relies on __test_and_clear_bit to modify the memory @@ -343,7 +343,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) */ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; - apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); + apic_native_eoi(); } static void kvm_guest_cpu_init(void) @@ -622,10 +622,10 @@ late_initcall(setup_efi_kvm_sev_migration); /* * Set the IPI entry points */ -static void kvm_setup_pv_ipi(void) +static __init void kvm_setup_pv_ipi(void) { - apic->send_IPI_mask = kvm_send_ipi_mask; - apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + apic_update_callback(send_IPI_mask, kvm_send_ipi_mask); + apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself); pr_info("setup PV IPIs\n"); } @@ -825,7 +825,7 @@ static void __init kvm_guest_init(void) } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - apic_set_eoi_write(kvm_guest_apic_eoi_write); + apic_update_callback(eoi, kvm_guest_apic_eoi_write); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); @@ -966,10 +966,8 @@ static void __init kvm_init_platform(void) * Ensure that _bss_decrypted section is marked as decrypted in the * shared pages list. */ - nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted, - PAGE_SIZE); early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, - nr_pages, 0); + __end_bss_decrypted - __start_bss_decrypted, 0); /* * If not booted using EFI, enable Live migration support. diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index fed721f90116..b223922248e9 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -48,7 +48,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info(struct mpc_cpu *m) { - int apicid; char *bootup_cpu = ""; if (!(m->cpuflag & CPU_ENABLED)) { @@ -56,15 +55,11 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apicid = m->apicid; - - if (m->cpuflag & CPU_BOOTPROCESSOR) { + if (m->cpuflag & CPU_BOOTPROCESSOR) bootup_cpu = " (Bootup-CPU)"; - boot_cpu_physical_apicid = m->apicid; - } pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); - generic_processor_info(apicid, m->apicver); + generic_processor_info(m->apicid); } #ifdef CONFIG_X86_IO_APIC @@ -380,11 +375,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) int i; /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* * 2 CPUs, numbered 0 & 1. */ processor.type = MP_PROCESSOR; @@ -525,10 +515,8 @@ void __init default_get_smp_config(unsigned int early) */ if (mpf->feature1) { if (early) { - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + /* Local APIC has default address */ + register_lapic_address(APIC_DEFAULT_PHYS_BASE); goto out; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7bb17d37db01..e17c16c54a37 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -39,7 +39,6 @@ #include <asm/cpufeature.h> #include <asm/msr.h> -static struct class *msr_class; static enum cpuhp_state cpuhp_msr_state; enum allow_write_msrs { @@ -235,26 +234,31 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; +static char *msr_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + +static const struct class msr_class = { + .name = "msr", + .devnode = msr_devnode, +}; + static int msr_device_create(unsigned int cpu) { struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + dev = device_create(&msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, "msr%d", cpu); return PTR_ERR_OR_ZERO(dev); } static int msr_device_destroy(unsigned int cpu) { - device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + device_destroy(&msr_class, MKDEV(MSR_MAJOR, cpu)); return 0; } -static char *msr_devnode(const struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); -} - static int __init msr_init(void) { int err; @@ -263,12 +267,9 @@ static int __init msr_init(void) pr_err("unable to get major %d for msr\n", MSR_MAJOR); return -EBUSY; } - msr_class = class_create("msr"); - if (IS_ERR(msr_class)) { - err = PTR_ERR(msr_class); + err = class_register(&msr_class); + if (err) goto out_chrdev; - } - msr_class->devnode = msr_devnode; err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", msr_device_create, msr_device_destroy); @@ -278,7 +279,7 @@ static int __init msr_init(void) return 0; out_class: - class_destroy(msr_class); + class_unregister(&msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); return err; @@ -288,7 +289,7 @@ module_init(msr_init); static void __exit msr_exit(void) { cpuhp_remove_state(cpuhp_msr_state); - class_destroy(msr_class); + class_unregister(&msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); } module_exit(msr_exit) diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index a1a96df3dff1..e93a8545c74d 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -75,7 +75,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) /* sync above data before sending NMI */ wmb(); - apic->send_IPI_mask(mask, NMI_VECTOR); + __apic_send_IPI_mask(mask, NMI_VECTOR); /* Don't wait longer than a second */ timeout = USEC_PER_SEC; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ac10b46c5832..975f98d5eee5 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -75,10 +75,16 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) + if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_disable(&virt_spin_lock_key); } +static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_page(tlb, table); +} + unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len) { @@ -295,8 +301,7 @@ struct paravirt_patch_template pv_ops = { .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = - (void (*)(struct mmu_gather *, void *))tlb_remove_page, + .mmu.tlb_remove_table = native_tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index de6be0a3965e..f323d83e40a7 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -72,9 +72,15 @@ static inline void __init pci_swiotlb_detect(void) #endif /* CONFIG_SWIOTLB */ #ifdef CONFIG_SWIOTLB_XEN +static bool xen_swiotlb_enabled(void) +{ + return xen_initial_domain() || x86_swiotlb_enable || + (IS_ENABLED(CONFIG_XEN_PCIDEV_FRONTEND) && xen_pv_pci_possible); +} + static void __init pci_xen_swiotlb_init(void) { - if (!xen_initial_domain() && !x86_swiotlb_enable) + if (!xen_swiotlb_enabled()) return; x86_swiotlb_enable = true; x86_swiotlb_flags |= SWIOTLB_ANY; @@ -83,27 +89,6 @@ static void __init pci_xen_swiotlb_init(void) if (IS_ENABLED(CONFIG_PCI)) pci_request_acs(); } - -int pci_xen_swiotlb_init_late(void) -{ - if (dma_ops == &xen_swiotlb_dma_ops) - return 0; - - /* we can work with the default swiotlb */ - if (!io_tlb_default_mem.nslabs) { - int rc = swiotlb_init_late(swiotlb_size_or_default(), - GFP_KERNEL, xen_swiotlb_fixup); - if (rc < 0) - return rc; - } - - /* XXX: this switches the dma ops under live devices! */ - dma_ops = &xen_swiotlb_dma_ops; - if (IS_ENABLED(CONFIG_PCI)) - pci_request_acs(); - return 0; -} -EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); #else static inline void __init pci_xen_swiotlb_init(void) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 72015dba72ab..9f0909142a0a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -51,6 +51,7 @@ #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> +#include <asm/shstk.h> #include "process.h" @@ -122,6 +123,7 @@ void exit_thread(struct task_struct *tsk) free_vm86(t); + shstk_free(tsk); fpu__drop(fpu); } @@ -162,6 +164,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; + unsigned long new_ssp; int ret = 0; childregs = task_pt_regs(p); @@ -199,7 +202,16 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags, args->fn); + /* + * Allocate a new shadow stack for thread if needed. If shadow stack, + * is disabled, new_ssp will remain 0, and fpu_clone() will know not to + * update it. + */ + new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size); + if (IS_ERR_VALUE(new_ssp)) + return PTR_ERR((void *)new_ssp); + + fpu_clone(p, clone_flags, args->fn, new_ssp); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { @@ -245,6 +257,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) io_bitmap_share(p); + /* + * If copy_thread() if failing, don't leak the shadow stack possibly + * allocated in shstk_alloc_thread_stack() above. + */ + if (ret) + shstk_free(p); + return ret; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d181c16a2f6..33b268747bb7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -515,6 +515,8 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, load_gs_index(__USER_DS); } + reset_thread_features(); + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); @@ -894,6 +896,12 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) else return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); #endif + case ARCH_SHSTK_ENABLE: + case ARCH_SHSTK_DISABLE: + case ARCH_SHSTK_LOCK: + case ARCH_SHSTK_UNLOCK: + case ARCH_SHSTK_STATUS: + return shstk_prctl(task, option, arg2); default: ret = -EINVAL; break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index dfaa270a7cc9..095f04bdabdc 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -58,6 +58,7 @@ enum x86_regset_64 { REGSET64_FP, REGSET64_IOPERM, REGSET64_XSTATE, + REGSET64_SSP, }; #define REGSET_GENERAL \ @@ -1267,6 +1268,17 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .active = ioperm_active, .regset_get = ioperm_get }, +#ifdef CONFIG_X86_USER_SHADOW_STACK + [REGSET64_SSP] = { + .core_note_type = NT_X86_SHSTK, + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .active = ssp_active, + .regset_get = ssp_get, + .set = ssp_set + }, +#endif }; static const struct user_regset_view user_x86_64_view = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fd975a4a5200..b9145a63da77 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,7 +114,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* CPU data as detected by the assembly code in head_32.S */ struct cpuinfo_x86 new_cpu_data; -unsigned int def_to_bigsmp; struct apm_info apm_info; EXPORT_SYMBOL(apm_info); @@ -1018,9 +1017,11 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); + apic_setup_apic_calls(); + if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC - disable_apic = 1; + apic_is_disabled = true; #endif setup_clear_cpu_cap(X86_FEATURE_APIC); } @@ -1253,7 +1254,7 @@ void __init setup_arch(char **cmdline_p) map_vsyscall(); - generic_apic_probe(); + x86_32_probe_apic(); early_quirks(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c242dc47e9cb..2c97bf7b56ae 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -181,15 +181,9 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_LOCAL_APIC per_cpu(x86_cpu_to_apicid, cpu) = early_per_cpu_map(x86_cpu_to_apicid, cpu); - per_cpu(x86_bios_cpu_apicid, cpu) = - early_per_cpu_map(x86_bios_cpu_apicid, cpu); per_cpu(x86_cpu_to_acpiid, cpu) = early_per_cpu_map(x86_cpu_to_acpiid, cpu); #endif -#ifdef CONFIG_X86_32 - per_cpu(x86_cpu_to_logical_apicid, cpu) = - early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); @@ -214,12 +208,8 @@ void __init setup_per_cpu_areas(void) /* indicate the early static arrays will soon be gone */ #ifdef CONFIG_X86_LOCAL_APIC early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; - early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL; #endif -#ifdef CONFIG_X86_32 - early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; -#endif #ifdef CONFIG_NUMA early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 1ee7bed453de..2787826d9f60 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1089,7 +1089,7 @@ static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) return ret; } -void snp_set_wakeup_secondary_cpu(void) +void __init snp_set_wakeup_secondary_cpu(void) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return; @@ -1099,7 +1099,7 @@ void snp_set_wakeup_secondary_cpu(void) * required method to start APs under SNP. If the hypervisor does * not support AP creation, then no APs will be started. */ - apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; + apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit); } int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) @@ -1575,6 +1575,9 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, long val, *reg = vc_insn_get_rm(ctxt); enum es_result ret; + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + if (!reg) return ES_DECODE_FAILED; @@ -1612,6 +1615,9 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, struct sev_es_runtime_data *data = this_cpu_read(runtime_data); long *reg = vc_insn_get_rm(ctxt); + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + if (!reg) return ES_DECODE_FAILED; diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c new file mode 100644 index 000000000000..fd689921a1db --- /dev/null +++ b/arch/x86/kernel/shstk.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * shstk.c - Intel shadow stack support + * + * Copyright (c) 2021, Intel Corporation. + * Yu-cheng Yu <[email protected]> + */ + +#include <linux/sched.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/sched/signal.h> +#include <linux/compat.h> +#include <linux/sizes.h> +#include <linux/user.h> +#include <linux/syscalls.h> +#include <asm/msr.h> +#include <asm/fpu/xstate.h> +#include <asm/fpu/types.h> +#include <asm/shstk.h> +#include <asm/special_insns.h> +#include <asm/fpu/api.h> +#include <asm/prctl.h> + +#define SS_FRAME_SIZE 8 + +static bool features_enabled(unsigned long features) +{ + return current->thread.features & features; +} + +static void features_set(unsigned long features) +{ + current->thread.features |= features; +} + +static void features_clr(unsigned long features) +{ + current->thread.features &= ~features; +} + +/* + * Create a restore token on the shadow stack. A token is always 8-byte + * and aligned to 8. + */ +static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) +{ + unsigned long addr; + + /* Token must be aligned */ + if (!IS_ALIGNED(ssp, 8)) + return -EINVAL; + + addr = ssp - SS_FRAME_SIZE; + + /* + * SSP is aligned, so reserved bits and mode bit are a zero, just mark + * the token 64-bit. + */ + ssp |= BIT(0); + + if (write_user_shstk_64((u64 __user *)addr, (u64)ssp)) + return -EFAULT; + + if (token_addr) + *token_addr = addr; + + return 0; +} + +/* + * VM_SHADOW_STACK will have a guard page. This helps userspace protect + * itself from attacks. The reasoning is as follows: + * + * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The + * INCSSP instruction can increment the shadow stack pointer. It is the + * shadow stack analog of an instruction like: + * + * addq $0x80, %rsp + * + * However, there is one important difference between an ADD on %rsp + * and INCSSP. In addition to modifying SSP, INCSSP also reads from the + * memory of the first and last elements that were "popped". It can be + * thought of as acting like this: + * + * READ_ONCE(ssp); // read+discard top element on stack + * ssp += nr_to_pop * 8; // move the shadow stack + * READ_ONCE(ssp-8); // read+discard last popped stack element + * + * The maximum distance INCSSP can move the SSP is 2040 bytes, before + * it would read the memory. Therefore a single page gap will be enough + * to prevent any operation from shifting the SSP to an adjacent stack, + * since it would have to land in the gap at least once, causing a + * fault. + */ +static unsigned long alloc_shstk(unsigned long addr, unsigned long size, + unsigned long token_offset, bool set_res_tok) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; + struct mm_struct *mm = current->mm; + unsigned long mapped_addr, unused; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + mmap_write_unlock(mm); + + if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) + goto out; + + if (create_rstor_token(mapped_addr + token_offset, NULL)) { + vm_munmap(mapped_addr, size); + return -EINVAL; + } + +out: + return mapped_addr; +} + +static unsigned long adjust_shstk_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); +} + +static void unmap_shadow_stack(u64 base, u64 size) +{ + int r; + + r = vm_munmap(base, size); + + /* + * mmap_write_lock_killable() failed with -EINTR. This means + * the process is about to die and have it's MM cleaned up. + * This task shouldn't ever make it back to userspace. In this + * case it is ok to leak a shadow stack, so just exit out. + */ + if (r == -EINTR) + return; + + /* + * For all other types of vm_munmap() failure, either the + * system is out of memory or there is bug. + */ + WARN_ON_ONCE(r); +} + +static int shstk_setup(void) +{ + struct thread_shstk *shstk = ¤t->thread.shstk; + unsigned long addr, size; + + /* Already enabled */ + if (features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* Also not supported for 32 bit and x32 */ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) + return -EOPNOTSUPP; + + size = adjust_shstk_size(0); + addr = alloc_shstk(0, size, 0, false); + if (IS_ERR_VALUE(addr)) + return PTR_ERR((void *)addr); + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, addr + size); + wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); + fpregs_unlock(); + + shstk->base = addr; + shstk->size = size; + features_set(ARCH_SHSTK_SHSTK); + + return 0; +} + +void reset_thread_features(void) +{ + memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk)); + current->thread.features = 0; + current->thread.features_locked = 0; +} + +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, + unsigned long stack_size) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + unsigned long addr, size; + + /* + * If shadow stack is not enabled on the new thread, skip any + * switch to a new shadow stack. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* + * For CLONE_VM, except vfork, the child needs a separate shadow + * stack. + */ + if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) + return 0; + + size = adjust_shstk_size(stack_size); + addr = alloc_shstk(0, size, 0, false); + if (IS_ERR_VALUE(addr)) + return addr; + + shstk->base = addr; + shstk->size = size; + + return addr + size; +} + +static unsigned long get_user_shstk_addr(void) +{ + unsigned long long ssp; + + fpregs_lock_and_load(); + + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + fpregs_unlock(); + + return ssp; +} + +#define SHSTK_DATA_BIT BIT(63) + +static int put_shstk_data(u64 __user *addr, u64 data) +{ + if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) + return -EINVAL; + + /* + * Mark the high bit so that the sigframe can't be processed as a + * return address. + */ + if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT)) + return -EFAULT; + return 0; +} + +static int get_shstk_data(unsigned long *data, unsigned long __user *addr) +{ + unsigned long ldata; + + if (unlikely(get_user(ldata, addr))) + return -EFAULT; + + if (!(ldata & SHSTK_DATA_BIT)) + return -EINVAL; + + *data = ldata & ~SHSTK_DATA_BIT; + + return 0; +} + +static int shstk_push_sigframe(unsigned long *ssp) +{ + unsigned long target_ssp = *ssp; + + /* Token must be aligned */ + if (!IS_ALIGNED(target_ssp, 8)) + return -EINVAL; + + *ssp -= SS_FRAME_SIZE; + if (put_shstk_data((void __user *)*ssp, target_ssp)) + return -EFAULT; + + return 0; +} + +static int shstk_pop_sigframe(unsigned long *ssp) +{ + struct vm_area_struct *vma; + unsigned long token_addr; + bool need_to_check_vma; + int err = 1; + + /* + * It is possible for the SSP to be off the end of a shadow stack by 4 + * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes + * before it, it might be this case, so check that the address being + * read is actually shadow stack. + */ + if (!IS_ALIGNED(*ssp, 8)) + return -EINVAL; + + need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; + + if (need_to_check_vma) + mmap_read_lock_killable(current->mm); + + err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); + if (unlikely(err)) + goto out_err; + + if (need_to_check_vma) { + vma = find_vma(current->mm, *ssp); + if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { + err = -EFAULT; + goto out_err; + } + + mmap_read_unlock(current->mm); + } + + /* Restore SSP aligned? */ + if (unlikely(!IS_ALIGNED(token_addr, 8))) + return -EINVAL; + + /* SSP in userspace? */ + if (unlikely(token_addr >= TASK_SIZE_MAX)) + return -EINVAL; + + *ssp = token_addr; + + return 0; +out_err: + if (need_to_check_vma) + mmap_read_unlock(current->mm); + return err; +} + +int setup_signal_shadow_stack(struct ksignal *ksig) +{ + void __user *restorer = ksig->ka.sa.sa_restorer; + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + if (!restorer) + return -EINVAL; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_push_sigframe(&ssp); + if (unlikely(err)) + return err; + + /* Push restorer address */ + ssp -= SS_FRAME_SIZE; + err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer); + if (unlikely(err)) + return -EFAULT; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +int restore_signal_shadow_stack(void) +{ + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_pop_sigframe(&ssp); + if (unlikely(err)) + return err; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +void shstk_free(struct task_struct *tsk) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return; + + /* + * When fork() with CLONE_VM fails, the child (tsk) already has a + * shadow stack allocated, and exit_thread() calls this function to + * free it. In this case the parent (current) and the child share + * the same mm struct. + */ + if (!tsk->mm || tsk->mm != current->mm) + return; + + unmap_shadow_stack(shstk->base, shstk->size); +} + +static int wrss_control(bool enable) +{ + u64 msrval; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* + * Only enable WRSS if shadow stack is enabled. If shadow stack is not + * enabled, WRSS will already be disabled, so don't bother clearing it + * when disabling. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -EPERM; + + /* Already enabled/disabled? */ + if (features_enabled(ARCH_SHSTK_WRSS) == enable) + return 0; + + fpregs_lock_and_load(); + rdmsrl(MSR_IA32_U_CET, msrval); + + if (enable) { + features_set(ARCH_SHSTK_WRSS); + msrval |= CET_WRSS_EN; + } else { + features_clr(ARCH_SHSTK_WRSS); + if (!(msrval & CET_WRSS_EN)) + goto unlock; + + msrval &= ~CET_WRSS_EN; + } + + wrmsrl(MSR_IA32_U_CET, msrval); + +unlock: + fpregs_unlock(); + + return 0; +} + +static int shstk_disable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* Already disabled? */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + fpregs_lock_and_load(); + /* Disable WRSS too when disabling shadow stack */ + wrmsrl(MSR_IA32_U_CET, 0); + wrmsrl(MSR_IA32_PL3_SSP, 0); + fpregs_unlock(); + + shstk_free(current); + features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); + + return 0; +} + +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + bool set_tok = flags & SHADOW_STACK_SET_TOKEN; + unsigned long aligned_size; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + if (flags & ~SHADOW_STACK_SET_TOKEN) + return -EINVAL; + + /* If there isn't space for a token */ + if (set_tok && size < 8) + return -ENOSPC; + + if (addr && addr < SZ_4G) + return -ERANGE; + + /* + * An overflow would result in attempting to write the restore token + * to the wrong location. Not catastrophic, but just return the right + * error code and block it. + */ + aligned_size = PAGE_ALIGN(size); + if (aligned_size < size) + return -EOVERFLOW; + + return alloc_shstk(addr, aligned_size, size, set_tok); +} + +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) +{ + unsigned long features = arg2; + + if (option == ARCH_SHSTK_STATUS) { + return put_user(task->thread.features, (unsigned long __user *)arg2); + } + + if (option == ARCH_SHSTK_LOCK) { + task->thread.features_locked |= features; + return 0; + } + + /* Only allow via ptrace */ + if (task != current) { + if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { + task->thread.features_locked &= ~features; + return 0; + } + return -EINVAL; + } + + /* Do not allow to change locked features */ + if (features & task->thread.features_locked) + return -EPERM; + + /* Only support enabling/disabling one feature at a time. */ + if (hweight_long(features) > 1) + return -EINVAL; + + if (option == ARCH_SHSTK_DISABLE) { + if (features & ARCH_SHSTK_WRSS) + return wrss_control(false); + if (features & ARCH_SHSTK_SHSTK) + return shstk_disable(); + return -EINVAL; + } + + /* Handle ARCH_SHSTK_ENABLE */ + if (features & ARCH_SHSTK_SHSTK) + return shstk_setup(); + if (features & ARCH_SHSTK_WRSS) + return wrss_control(true); + return -EINVAL; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cfeec3ee877e..65fe2094da59 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -40,6 +40,7 @@ #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> +#include <asm/shstk.h> static inline int is_ia32_compat_frame(struct ksignal *ksig) { diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 9027fc088f97..c12624bc82a3 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -402,7 +402,7 @@ Efault: */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 13a1e6083837..cacf2ede6217 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -175,6 +175,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -260,6 +263,9 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; + if (restore_signal_shadow_stack()) + goto badframe; + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; @@ -403,7 +409,7 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7eb18ca7bd45..6eb06d001bcc 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -135,7 +135,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { - ack_APIC_irq(); + apic_eoi(); cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); } @@ -237,7 +237,7 @@ static void native_stop_other_cpus(int wait) pr_emerg("Shutting down cpus with NMI\n"); for_each_cpu(cpu, &cpus_stop_mask) - apic->send_IPI(cpu, NMI_VECTOR); + __apic_send_IPI(cpu, NMI_VECTOR); } /* * Don't wait longer than 10 ms if the caller didn't @@ -268,7 +268,7 @@ done: */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { - ack_APIC_irq(); + apic_eoi(); trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); scheduler_ipi(); @@ -277,7 +277,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { - ack_APIC_irq(); + apic_eoi(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); @@ -286,7 +286,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { - ack_APIC_irq(); + apic_eoi(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e1aa2cd7734b..d7667a29acf3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -327,14 +327,6 @@ static void notrace start_secondary(void *unused) } /** - * topology_smt_supported - Check whether SMT is supported by the CPUs - */ -bool topology_smt_supported(void) -{ - return smp_num_siblings > 1; -} - -/** * topology_phys_to_logical_pkg - Map a physical package id to a logical * @phys_pkg: The physical package id to map * @@ -422,7 +414,7 @@ found: return 0; } -void __init smp_store_boot_cpu_info(void) +static void __init smp_store_boot_cpu_info(void) { int id = 0; /* CPU 0 */ struct cpuinfo_x86 *c = &cpu_data(id); @@ -632,14 +624,9 @@ static void __init build_sched_topology(void) }; #endif #ifdef CONFIG_SCHED_CLUSTER - /* - * For now, skip the cluster domain on Hybrid. - */ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) - }; - } + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) + }; #endif #ifdef CONFIG_SCHED_MC x86_topology[i++] = (struct sched_domain_topology_level){ @@ -774,44 +761,6 @@ static void impress_friends(void) pr_debug("Before bogocount - setting activated=1\n"); } -void __inquire_remote_apic(int apicid) -{ - unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; - const char * const names[] = { "ID", "VERSION", "SPIV" }; - int timeout; - u32 status; - - pr_info("Inquiring remote APIC 0x%x...\n", apicid); - - for (i = 0; i < ARRAY_SIZE(regs); i++) { - pr_info("... APIC 0x%x %s: ", apicid, names[i]); - - /* - * Wait for idle. - */ - status = safe_apic_wait_icr_idle(); - if (status) - pr_cont("a previous APIC delivery may have failed\n"); - - apic_icr_write(APIC_DM_REMRD | regs[i], apicid); - - timeout = 0; - do { - udelay(100); - status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; - } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); - - switch (status) { - case APIC_ICR_RR_VALID: - status = apic_read(APIC_RRR); - pr_cont("%08x\n", status); - break; - default: - pr_cont("failed\n"); - } - } -} - /* * The Multiprocessor Specification 1.4 (1997) example code suggests * that there should be a 10ms delay between the BSP asserting INIT @@ -1102,9 +1051,8 @@ int native_kick_ap(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || - !physid_isset(apicid, phys_cpu_present_map) || - !apic->apic_id_valid(apicid)) { + if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || + !apic_id_valid(apicid)) { pr_err("%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } @@ -1187,58 +1135,6 @@ static __init void disable_smp(void) cpumask_set_cpu(0, topology_die_cpumask(0)); } -/* - * Various sanity checks. - */ -static void __init smp_sanity_check(void) -{ - preempt_disable(); - -#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32) - if (def_to_bigsmp && nr_cpu_ids > 8) { - unsigned int cpu; - unsigned nr; - - pr_warn("More than 8 CPUs detected - skipping them\n" - "Use CONFIG_X86_BIGSMP\n"); - - nr = 0; - for_each_present_cpu(cpu) { - if (nr >= 8) - set_cpu_present(cpu, false); - nr++; - } - - nr = 0; - for_each_possible_cpu(cpu) { - if (nr >= 8) - set_cpu_possible(cpu, false); - nr++; - } - - set_nr_cpu_ids(8); - } -#endif - - if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", - hard_smp_processor_id()); - - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * Should not be necessary because the MP table should list the boot - * CPU too, but we do it for the sake of robustness anyway. - */ - if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { - pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", - boot_cpu_physical_apicid); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - preempt_enable(); -} - static void __init smp_cpu_index_default(void) { int i; @@ -1298,8 +1194,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - smp_sanity_check(); - switch (apic_intr_mode) { case APIC_PIC: case APIC_VIRTUAL_WIRE_NO_CONFIG: @@ -1435,24 +1329,6 @@ __init void prefill_possible_map(void) { int i, possible; - /* No boot processor was found in mptable or ACPI MADT */ - if (!num_processors) { - if (boot_cpu_has(X86_FEATURE_APIC)) { - int apicid = boot_cpu_physical_apicid; - int cpu = hard_smp_processor_id(); - - pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu); - - /* Make sure boot cpu is enumerated */ - if (apic->cpu_present_to_apicid(0) == BAD_APICID && - apic->apic_id_valid(apicid)) - generic_processor_info(apicid, boot_cpu_apic_version); - } - - if (!num_processors) - num_processors = 1; - } - i = setup_max_cpus ?: 1; if (setup_possible_cpus == -1) { possible = num_processors; @@ -1614,9 +1490,7 @@ void play_dead_common(void) idle_task_exit(); cpuhp_ap_report_dead(); - /* - * With physical CPU hotplug, we should halt the cpu - */ + local_irq_disable(); } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8cc653ffdccd..c783aeb37dce 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -193,7 +193,11 @@ get_unmapped_area: info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + if (!in_32bit_syscall() && (flags & MAP_ABOVE4G)) + info.low_limit = SZ_4G; + else + info.low_limit = PAGE_SIZE; + info.high_limit = get_mmap_base(0); /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4a817d20ce3b..c876f1d36a81 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -77,18 +77,6 @@ DECLARE_BITMAP(system_vectors, NR_VECTORS); -static inline void cond_local_irq_enable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void cond_local_irq_disable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) @@ -213,81 +201,6 @@ DEFINE_IDTENTRY(exc_overflow) do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } -#ifdef CONFIG_X86_KERNEL_IBT - -static __ro_after_init bool ibt_fatal = true; - -extern void ibt_selftest_ip(void); /* code label defined in asm below */ - -enum cp_error_code { - CP_EC = (1 << 15) - 1, - - CP_RET = 1, - CP_IRET = 2, - CP_ENDBR = 3, - CP_RSTRORSSP = 4, - CP_SETSSBSY = 5, - - CP_ENCL = 1 << 15, -}; - -DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) -{ - if (!cpu_feature_enabled(X86_FEATURE_IBT)) { - pr_err("Unexpected #CP\n"); - BUG(); - } - - if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) - return; - - if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { - regs->ax = 0; - return; - } - - pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); - if (!ibt_fatal) { - printk(KERN_DEFAULT CUT_HERE); - __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); - return; - } - BUG(); -} - -/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */ -noinline bool ibt_selftest(void) -{ - unsigned long ret; - - asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" - ANNOTATE_RETPOLINE_SAFE - " jmp *%%rax\n\t" - "ibt_selftest_ip:\n\t" - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - " nop\n\t" - - : "=a" (ret) : : "memory"); - - return !ret; -} - -static int __init ibt_setup(char *str) -{ - if (!strcmp(str, "off")) - setup_clear_cpu_cap(X86_FEATURE_IBT); - - if (!strcmp(str, "warn")) - ibt_fatal = false; - - return 1; -} - -__setup("ibt=", ibt_setup); - -#endif /* CONFIG_X86_KERNEL_IBT */ - #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3425c6a943e4..15f97c0abc9d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1258,7 +1258,7 @@ static void __init check_system_tsc_reliable(void) if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && - nr_online_nodes <= 2) + nr_online_nodes <= 4) tsc_disable_clocksource_watchdog(); } diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 796cfaa46bfa..65e96b76c423 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -129,7 +129,7 @@ static void __init vsmp_cap_cpus(void) static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return read_apic_id() >> index_msb; } static void vsmp_apic_post_init(void) |