diff options
Diffstat (limited to 'arch/x86/kernel')
141 files changed, 5389 insertions, 3776 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 581386c7e429..a01892bdd61a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg +CFLAGS_REMOVE_head64.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -27,8 +28,9 @@ KASAN_SANITIZE_stacktrace.o := n OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y +OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, @@ -46,7 +48,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o -obj-$(CONFIG_X86_64) += sys_x86_64.o mcount_64.o +obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o @@ -82,6 +84,7 @@ obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_LIVEPATCH) += livepatch.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o @@ -100,8 +103,6 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o -obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o -obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 26b78d86f25a..85a9e17e0dbc 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,3 +1,5 @@ +OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y + obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 64422f850e95..7491e73d9253 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,7 +35,9 @@ #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/pci.h> +#include <linux/efi-bgrt.h> +#include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> #include <asm/pgtable.h> @@ -178,10 +180,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = __generic_processor_info(id, ver, enabled); + cpu = generic_processor_info(id, ver); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -340,6 +347,14 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, struct mpc_intsrc mp_irq; /* + * Check bus_irq boundary. + */ + if (bus_irq >= NR_IRQS_LEGACY) { + pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq); + return; + } + + /* * Convert 'gsi' to 'ioapic.pin'. */ ioapic = mp_find_ioapic(gsi); @@ -709,7 +724,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -723,11 +738,12 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) return 0; } -int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, + int *pcpu) { int cpu; - cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED); + cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); return cpu; @@ -1581,6 +1597,8 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); if (!acpi_noirq) x86_init.pci.init = pci_acpi_init; @@ -1714,6 +1732,6 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820_add_region(addr, size, E820_ACPI); - update_e820(); + e820__range_add(addr, size, E820_TYPE_ACPI); + e820__update_table_print(); } diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index af15f4444330..dde437f5d14f 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -12,7 +12,6 @@ #include <linux/sched.h> #include <acpi/processor.h> -#include <asm/acpi.h> #include <asm/mwait.h> #include <asm/special_insns.h> @@ -89,7 +88,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) retval = 0; /* If the HW does not support any sub-states in this C-state */ if (num_cstate_subtype == 0) { - pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part); + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", + cx->address, edx_part); retval = -1; goto out; } @@ -104,8 +104,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) if (!mwait_supported[cstate_type]) { mwait_supported[cstate_type] = 1; printk(KERN_DEBUG - "Monitor-Mwait will be used to enter C-%d " - "state\n", cx->type); + "Monitor-Mwait will be used to enter C-%d state\n", + cx->type); } snprintf(cx->desc, ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", @@ -166,7 +166,9 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter); static int __init ffh_cstate_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; - if (c->x86_vendor != X86_VENDOR_INTEL) + + if (c->x86_vendor != X86_VENDOR_INTEL && + c->x86_vendor != X86_VENDOR_AMD) return -1; cpu_cstate_entry = alloc_percpu(struct cstate_entry); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 48587335ede8..ed014814ea35 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void) #ifdef CONFIG_SMP initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = - (unsigned long)get_cpu_gdt_table(smp_processor_id()); + (unsigned long)get_cpu_gdt_rw(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c5b8f760473c..32e14d137416 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -409,8 +409,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, memcpy(insnbuf, replacement, a->replacementlen); insnbuf_sz = a->replacementlen; - /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) { + /* + * 0xe8 is a relative jump; fix the offset. + * + * Instruction length is checked before the opcode to avoid + * accessing uninitialized bytes for zero-length replacements. + */ + if (a->replacementlen == 5 && *insnbuf == 0xe8) { *(s32 *)(insnbuf + 1) += replacement - instr; DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", *(s32 *)(insnbuf + 1), diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 63ff468a7986..cc0e8bc0ea3f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/pci.h> @@ -35,7 +36,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/amd_nb.h> @@ -695,7 +696,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) return -1; } -static struct dma_map_ops gart_dma_ops = { +static const struct dma_map_ops gart_dma_ops = { .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, .map_page = gart_map_page, @@ -703,6 +704,7 @@ static struct dma_map_ops gart_dma_ops = { .alloc = gart_alloc_coherent, .free = gart_free_coherent, .mapping_error = gart_mapping_error, + .dma_supported = x86_dma_supported, }; static void gart_iommu_shutdown(void) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 0a2bb1f62e72..ef2859f9fcce 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,7 +21,7 @@ #include <linux/pci.h> #include <linux/bitops.h> #include <linux/suspend.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -306,13 +306,13 @@ void __init early_gart_iommu_check(void) fix = 1; if (gart_fix_e820 && !fix && aper_enabled) { - if (e820_any_mapped(aper_base, aper_base + aper_size, - E820_RAM)) { + if (e820__mapped_any(aper_base, aper_base + aper_size, + E820_TYPE_RAM)) { /* reserve it, so we can reuse it in second kernel */ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", aper_base, aper_base + aper_size - 1); - e820_add_region(aper_base, aper_size, E820_RESERVED); - update_e820(); + e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED); + e820__update_table_print(); } } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5b7e43eff139..98b3dd8cf2bf 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -54,6 +54,8 @@ #include <asm/mce.h> #include <asm/tsc.h> #include <asm/hypervisor.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> unsigned int num_processors; @@ -529,21 +531,97 @@ static void lapic_timer_broadcast(const struct cpumask *mask) * The local apic timer can be used for any function which is CPU local. */ static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP - | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_state_shutdown = lapic_timer_shutdown, - .set_state_periodic = lapic_timer_set_periodic, - .set_state_oneshot = lapic_timer_set_oneshot, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_state_oneshot_stopped = lapic_timer_shutdown, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); +#define DEADLINE_MODEL_MATCH_FUNC(model, func) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func } + +#define DEADLINE_MODEL_MATCH_REV(model, rev) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev } + +static u32 hsx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x02: return 0x3a; /* EP */ + case 0x04: return 0x0f; /* EX */ + } + + return ~0U; +} + +static u32 bdx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x02: return 0x00000011; + case 0x03: return 0x0700000e; + case 0x04: return 0x0f00000c; + case 0x05: return 0x0e000003; + } + + return ~0U; +} + +static const struct x86_cpu_id deadline_match[] = { + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52), + + {}, +}; + +static void apic_check_deadline_errata(void) +{ + const struct x86_cpu_id *m = x86_match_cpu(deadline_match); + u32 rev; + + if (!m) + return; + + /* + * Function pointers will have the MSB set due to address layout, + * immediate revisions will not. + */ + if ((long)m->driver_data < 0) + rev = ((u32 (*)(void))(m->driver_data))(); + else + rev = (u32)m->driver_data; + + if (boot_cpu_data.microcode >= rev) + return; + + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " + "please update microcode to version: 0x%x (or later)\n", rev); +} + /* * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. @@ -562,6 +640,7 @@ static void setup_APIC_timer(void) levt->cpumask = cpumask_of(smp_processor_id()); if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->name = "lapic-deadline"; levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; @@ -730,8 +809,10 @@ static int __init calibrate_APIC_clock(void) TICK_NSEC, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -777,8 +858,10 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; @@ -1245,7 +1328,7 @@ static void lapic_setup_esr(void) /** * setup_local_APIC - setup the local APIC * - * Used to setup local APIC while initializing BSP or bringin up APs. + * Used to setup local APIC while initializing BSP or bringing up APs. * Always called with preemption disabled. */ void setup_local_APIC(void) @@ -1609,24 +1692,15 @@ static inline void try_to_enable_x2apic(int remap_mode) { } static inline void __x2apic_enable(void) { } #endif /* !CONFIG_X86_X2APIC */ -static int __init try_to_enable_IR(void) -{ -#ifdef CONFIG_X86_IO_APIC - if (!x2apic_enabled() && skip_ioapic_setup) { - pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); - return -1; - } -#endif - return irq_remapping_enable(); -} - void __init enable_IR_x2apic(void) { unsigned long flags; int ret, ir_stat; - if (skip_ioapic_setup) + if (skip_ioapic_setup) { + pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); return; + } ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) @@ -1644,7 +1718,7 @@ void __init enable_IR_x2apic(void) /* If irq_remapping_prepare() succeeded, try to enable it */ if (ir_stat >= 0) - ir_stat = try_to_enable_IR(); + ir_stat = irq_remapping_enable(); /* ir_stat contains the remap mode or an error code */ try_to_enable_x2apic(ir_stat); @@ -1783,6 +1857,8 @@ void __init init_apic_mappings(void) { unsigned int new_apicid; + apic_check_deadline_errata(); + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; @@ -1797,8 +1873,8 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; /* - * acpi lapic path already maps that address in - * acpi_register_lapic_address() + * If the system has ACPI MADT tables or MP info, the LAPIC + * address is already registered. */ if (!acpi_lapic && !smp_found_config) register_lapic_address(apic_phys); @@ -1864,14 +1940,14 @@ static void __smp_spurious_interrupt(u8 vector) "should never happen.\n", vector, smp_processor_id()); } -__visible void smp_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(~regs->orig_ax); exiting_irq(); } -__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) { u8 vector = ~regs->orig_ax; @@ -1922,14 +1998,14 @@ static void __smp_error_interrupt(struct pt_regs *regs) } -__visible void smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -__visible void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); @@ -2028,8 +2104,8 @@ void disconnect_bsp_APIC(int virt_wire_setup) /* * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated * contiguously, it equals to current allocated max logical CPU ID plus 1. - * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of - * nr_logical_cpuids is nr_cpu_ids. + * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, + * so the maximum of nr_logical_cpuids is nr_cpu_ids. * * NOTE: Reserve 0 for BSP. */ @@ -2061,17 +2137,17 @@ static int allocate_logical_cpuid(int apicid) /* Allocate a new cpuid. */ if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "Only %d processors supported." + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. " "Processor %d/0x%x and the rest are ignored.\n", - nr_cpu_ids - 1, nr_logical_cpuids, apicid); - return -1; + nr_cpu_ids, nr_logical_cpuids, apicid); + return -EINVAL; } cpuid_to_apicid[nr_logical_cpuids] = apicid; return nr_logical_cpuids++; } -int __generic_processor_info(int apicid, int version, bool enabled) +int generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2094,7 +2170,7 @@ int __generic_processor_info(int apicid, int version, bool enabled) * Since fixing handling of boot_cpu_physical_apicid requires * another discussion and tests on each platform, we leave it * for now and here we use read_apic_id() directly in this - * function, generic_processor_info(). + * function, __generic_processor_info(). */ if (disabled_cpu_apicid != BAD_APICID && disabled_cpu_apicid != read_apic_id() && @@ -2129,11 +2205,9 @@ int __generic_processor_info(int apicid, int version, bool enabled) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - if (enabled) { - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); - } + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2185,23 +2259,13 @@ int __generic_processor_info(int apicid, int version, bool enabled) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - - if (enabled) { - num_processors++; - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - } else { - disabled_cpus++; - } + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; return cpu; } -int generic_processor_info(int apicid, int version) -{ - return __generic_processor_info(apicid, version, true); -} - int hard_smp_processor_id(void) { return read_apic_id(); @@ -2217,23 +2281,32 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) +int default_cpu_mask_to_apicid(const struct cpumask *mask, + struct irq_data *irqdata, + unsigned int *apicid) { - unsigned int cpu; + unsigned int cpu = cpumask_first(mask); - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } + if (cpu >= nr_cpu_ids) + return -EINVAL; + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + irq_data_update_effective_affinity(irqdata, cpumask_of(cpu)); + return 0; +} - if (likely(cpu < nr_cpu_ids)) { - *apicid = per_cpu(x86_cpu_to_apicid, cpu); - return 0; - } +int flat_cpu_mask_to_apicid(const struct cpumask *mask, + struct irq_data *irqdata, + unsigned int *apicid) - return -EINVAL; +{ + struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); + unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS; + + if (!cpu_mask) + return -EINVAL; + *apicid = (unsigned int)cpu_mask; + cpumask_bits(effmsk)[0] = cpu_mask; + return 0; } /* @@ -2257,7 +2330,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) static void __init apic_bsp_up_setup(void) { #ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); + apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); #else /* * Hack: In case of kdump, after a crash, kernel might be booting @@ -2647,7 +2720,7 @@ static int __init lapic_insert_resource(void) } /* - * need call insert after e820_reserve_resources() + * need call insert after e820__reserve_resources() * that is using request_resource */ late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index a4d7ff20ed22..dedd5a41ba48 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -172,7 +172,7 @@ static struct apic apic_flat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, @@ -268,7 +268,7 @@ static struct apic apic_physflat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b109e4389c92..6599f437b4ab 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -26,7 +26,7 @@ #include <linux/interrupt.h> #include <asm/acpi.h> -#include <asm/e820.h> +#include <asm/e820/api.h> static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } @@ -141,7 +141,7 @@ struct apic apic_noop __ro_after_init = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index e08fe2c8dd8c..2fda912219a6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -267,7 +267,7 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, @@ -318,7 +318,7 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 56012010332c..456e45e8bf84 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -172,7 +172,7 @@ static struct apic apic_bigsmp __ro_after_init = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index ae50d3454d78..56ccf9346b08 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -150,16 +150,27 @@ static const struct irq_domain_ops htirq_domain_ops = { .deactivate = htirq_domain_deactivate, }; -void arch_init_htirq_domain(struct irq_domain *parent) +void __init arch_init_htirq_domain(struct irq_domain *parent) { + struct fwnode_handle *fn; + if (disable_apic) return; - htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + fn = irq_domain_alloc_named_fwnode("PCI-HT"); + if (!fn) + goto warn; + + htirq_domain = irq_domain_create_tree(fn, &htirq_domain_ops, NULL); + irq_domain_free_fwnode(fn); if (!htirq_domain) - pr_warn("failed to initialize irqdomain for HTIRQ.\n"); - else - htirq_domain->parent = parent; + goto warn; + + htirq_domain->parent = parent; + return; + +warn: + pr_warn("Failed to initialize irqdomain for HTIRQ.\n"); } int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c73c9fb281e1..d6f387780849 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,7 +19,7 @@ #include <linux/init.h> #include <linux/delay.h> -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index bd6b8c270c24..237e9c2341c7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1107,12 +1107,12 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) ioapic = mp_find_ioapic(gsi); if (ioapic < 0) - return -1; + return -ENODEV; pin = mp_find_ioapic_pin(ioapic, gsi); idx = find_irq_entry(ioapic, pin, mp_INT); if ((flags & IOAPIC_MAP_CHECK) && idx < 0) - return -1; + return -ENODEV; return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } @@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); static struct irq_chip ioapic_chip, ioapic_ir_chip; -#ifdef CONFIG_X86_32 -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for_each_ioapic_pin(apic, pin) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0))) - return irq_trigger(idx); - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} -#else -static inline int IO_APIC_irq_trigger(int irq) -{ - return 1; -} -#endif - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; @@ -1875,6 +1853,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ack_level, .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -1886,6 +1865,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ir_ack_level, .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2113,7 +2093,7 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic_irq(irq_get_chip_data(0)); + unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); @@ -2221,6 +2201,8 @@ static int mp_irqdomain_create(int ioapic) struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + struct fwnode_handle *fn; + char *name = "IO-APIC"; if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; @@ -2231,9 +2213,25 @@ static int mp_irqdomain_create(int ioapic) parent = irq_remapping_get_ir_irq_domain(&info); if (!parent) parent = x86_vector_domain; + else + name = "IO-APIC-IR"; + + /* Handle device tree enumerated APICs proper */ + if (cfg->dev) { + fn = of_node_to_fwnode(cfg->dev); + } else { + fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + if (!fn) + return -ENOMEM; + } + + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, + (void *)(long)ioapic); + + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); - ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, - (void *)(long)ioapic); if (!ip->irqdomain) return -ENOMEM; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 015bbf30e3e3..9b18be764422 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -82,7 +82,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (domain == NULL) return -ENOSYS; - return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); + return msi_domain_alloc_irqs(domain, &dev->dev, nvec); } void native_teardown_msi_irq(unsigned int irq) @@ -136,13 +136,20 @@ static struct msi_domain_info pci_msi_domain_info = { .handler_name = "edge", }; -void arch_init_msi_domain(struct irq_domain *parent) +void __init arch_init_msi_domain(struct irq_domain *parent) { + struct fwnode_handle *fn; + if (disable_apic) return; - msi_default_domain = pci_msi_create_irq_domain(NULL, - &pci_msi_domain_info, parent); + fn = irq_domain_alloc_named_fwnode("PCI-MSI"); + if (fn) { + msi_default_domain = + pci_msi_create_irq_domain(fn, &pci_msi_domain_info, + parent); + irq_domain_free_fwnode(fn); + } if (!msi_default_domain) pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); } @@ -167,9 +174,18 @@ static struct msi_domain_info pci_msi_ir_domain_info = { .handler_name = "edge", }; -struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) +struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, + const char *name, int id) { - return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent); + struct fwnode_handle *fn; + struct irq_domain *d; + + fn = irq_domain_alloc_named_id_fwnode(name, id); + if (!fn) + return NULL; + d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); + irq_domain_free_fwnode(fn); + return d; } #endif @@ -221,13 +237,20 @@ static struct irq_domain *dmar_get_irq_domain(void) { static struct irq_domain *dmar_domain; static DEFINE_MUTEX(dmar_lock); + struct fwnode_handle *fn; mutex_lock(&dmar_lock); - if (dmar_domain == NULL) - dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info, + if (dmar_domain) + goto out; + + fn = irq_domain_alloc_named_fwnode("DMAR-MSI"); + if (fn) { + dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, x86_vector_domain); + irq_domain_free_fwnode(fn); + } +out: mutex_unlock(&dmar_lock); - return dmar_domain; } @@ -317,9 +340,10 @@ static struct msi_domain_info hpet_msi_domain_info = { struct irq_domain *hpet_create_irq_domain(int hpet_id) { - struct irq_domain *parent; - struct irq_alloc_info info; struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct irq_alloc_info info; + struct fwnode_handle *fn; if (x86_vector_domain == NULL) return NULL; @@ -340,7 +364,16 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) else hpet_msi_controller.name = "IR-HPET-MSI"; - return msi_create_irq_domain(NULL, domain_info, parent); + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + d = msi_create_irq_domain(fn, domain_info, parent); + irq_domain_free_fwnode(fn); + return d; } int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index c48264e202fd..63287659adb6 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -25,7 +25,7 @@ #include <linux/interrupt.h> #include <asm/acpi.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) @@ -102,7 +102,7 @@ static struct apic apic_default __ro_after_init = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 5d30c5e42bb1..b3af457ed667 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -103,7 +103,8 @@ static void free_apic_chip_data(struct apic_chip_data *data) } static int __assign_irq_vector(int irq, struct apic_chip_data *d, - const struct cpumask *mask) + const struct cpumask *mask, + struct irq_data *irqdata) { /* * NOTE! The local APIC isn't very good at handling @@ -141,7 +142,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, /* * Clear the offline cpus from @vector_cpumask for searching * and verify whether the result overlaps with @mask. If true, - * then the call to apic->cpu_mask_to_apicid_and() will + * then the call to apic->cpu_mask_to_apicid() will * succeed as well. If not, no point in trying to find a * vector in this mask. */ @@ -221,34 +222,40 @@ success: * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail * as we already established, that mask & d->domain & cpu_online_mask * is not empty. + * + * vector_searchmask is a subset of d->domain and has the offline + * cpus masked out. */ - BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain, - &d->cfg.dest_apicid)); + cpumask_and(vector_searchmask, vector_searchmask, mask); + BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqdata, + &d->cfg.dest_apicid)); return 0; } static int assign_irq_vector(int irq, struct apic_chip_data *data, - const struct cpumask *mask) + const struct cpumask *mask, + struct irq_data *irqdata) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, data, mask); + err = __assign_irq_vector(irq, data, mask, irqdata); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } static int assign_irq_vector_policy(int irq, int node, struct apic_chip_data *data, - struct irq_alloc_info *info) + struct irq_alloc_info *info, + struct irq_data *irqdata) { if (info && info->mask) - return assign_irq_vector(irq, data, info->mask); + return assign_irq_vector(irq, data, info->mask, irqdata); if (node != NUMA_NO_NODE && - assign_irq_vector(irq, data, cpumask_of_node(node)) == 0) + assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0) return 0; - return assign_irq_vector(irq, data, apic->target_cpus()); + return assign_irq_vector(irq, data, apic->target_cpus(), irqdata); } static void clear_irq_vector(int irq, struct apic_chip_data *data) @@ -360,9 +367,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector_policy(virq + i, node, data, info); + err = assign_irq_vector_policy(virq + i, node, data, info, + irq_data); if (err) goto error; + /* + * If the apic destination mode is physical, then the + * effective affinity is restricted to a single target + * CPU. Mark the interrupt accordingly. + */ + if (!apic->irq_dest_mode) + irqd_set_single_target(irq_data); } return 0; @@ -405,7 +420,7 @@ int __init arch_probe_nr_irqs(void) } #ifdef CONFIG_X86_IO_APIC -static void init_legacy_irqs(void) +static void __init init_legacy_irqs(void) { int i, node = cpu_to_node(0); struct apic_chip_data *data; @@ -424,16 +439,21 @@ static void init_legacy_irqs(void) } } #else -static void init_legacy_irqs(void) { } +static inline void init_legacy_irqs(void) { } #endif int __init arch_early_irq_init(void) { + struct fwnode_handle *fn; + init_legacy_irqs(); - x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, - NULL); + fn = irq_domain_alloc_named_fwnode("VECTOR"); + BUG_ON(!fn); + x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, + NULL); BUG_ON(x86_vector_domain == NULL); + irq_domain_free_fwnode(fn); irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); @@ -529,11 +549,12 @@ static int apic_set_affinity(struct irq_data *irq_data, if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, data, dest); + err = assign_irq_vector(irq, data, dest, irq_data); return err ? err : IRQ_SET_MASK_OK; } static struct irq_chip lapic_controller = { + .name = "APIC", .irq_ack = apic_ack_edge, .irq_set_affinity = apic_set_affinity, .irq_retrigger = apic_retrigger_irq, @@ -559,7 +580,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(data); } -asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 5a35f208ed95..481237cb1544 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -4,6 +4,7 @@ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/dmar.h> +#include <linux/irq.h> #include <linux/cpu.h> #include <asm/smp.h> @@ -104,35 +105,30 @@ static void x2apic_send_IPI_all(int vector) } static int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) +x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, + unsigned int *apicid) { + struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); + unsigned int cpu; u32 dest = 0; u16 cluster; - int i; - - for_each_cpu_and(i, cpumask, andmask) { - if (!cpumask_test_cpu(i, cpu_online_mask)) - continue; - dest = per_cpu(x86_cpu_to_logical_apicid, i); - cluster = x2apic_cluster(i); - break; - } - if (!dest) + cpu = cpumask_first(mask); + if (cpu >= nr_cpu_ids) return -EINVAL; - for_each_cpu_and(i, cpumask, andmask) { - if (!cpumask_test_cpu(i, cpu_online_mask)) - continue; - if (cluster != x2apic_cluster(i)) + dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + cluster = x2apic_cluster(cpu); + + cpumask_clear(effmsk); + for_each_cpu(cpu, mask) { + if (cluster != x2apic_cluster(cpu)) continue; - dest |= per_cpu(x86_cpu_to_logical_apicid, i); + dest |= per_cpu(x86_cpu_to_logical_apicid, cpu); + cpumask_set_cpu(cpu, effmsk); } *apicid = dest; - return 0; } @@ -256,7 +252,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index ff111f05a314..3baf0c3dc875 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -127,7 +127,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 35690a168cf7..0d57bb9079c9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -34,6 +34,7 @@ #include <asm/uv/bios.h> #include <asm/uv/uv.h> #include <asm/apic.h> +#include <asm/e820/api.h> #include <asm/ipi.h> #include <asm/smp.h> #include <asm/x86_init.h> @@ -41,40 +42,44 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); -#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) +static enum uv_system_type uv_system_type; +static bool uv_hubless_system; +static u64 gru_start_paddr, gru_end_paddr; +static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; +static u64 gru_dist_lmask, gru_dist_umask; +static union uvh_apicid uvh_apicid; -static enum uv_system_type uv_system_type; -static u64 gru_start_paddr, gru_end_paddr; -static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; -static u64 gru_dist_lmask, gru_dist_umask; -static union uvh_apicid uvh_apicid; - -/* info derived from CPUID */ +/* Information derived from CPUID: */ static struct { unsigned int apicid_shift; unsigned int apicid_mask; unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ unsigned int pnode_mask; unsigned int gpa_shift; + unsigned int gnode_shift; } uv_cpuid; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); + unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; -/* Set this to use hardware error handler instead of kernel panic */ +/* Set this to use hardware error handler instead of kernel panic: */ static int disable_uv_undefined_panic = 1; + unsigned long uv_undefined(char *str) { if (likely(!disable_uv_undefined_panic)) panic("UV: error: undefined MMR: %s\n", str); else pr_crit("UV: error: undefined MMR: %s\n", str); - return ~0ul; /* cause a machine fault */ + + /* Cause a machine fault: */ + return ~0ul; } EXPORT_SYMBOL(uv_undefined); @@ -85,18 +90,19 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); val = *mmr; early_iounmap(mmr, sizeof(*mmr)); + return val; } static inline bool is_GRU_range(u64 start, u64 end) { if (gru_dist_base) { - u64 su = start & gru_dist_umask; /* upper (incl pnode) bits */ - u64 sl = start & gru_dist_lmask; /* base offset bits */ + u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */ + u64 sl = start & gru_dist_lmask; /* Base offset bits */ u64 eu = end & gru_dist_umask; u64 el = end & gru_dist_lmask; - /* Must reside completely within a single GRU range */ + /* Must reside completely within a single GRU range: */ return (sl == gru_dist_base && el == gru_dist_base && su >= gru_first_node_paddr && su <= gru_last_node_paddr && @@ -133,13 +139,14 @@ static int __init early_get_pnodeid(void) break; case UV4_HUB_PART_NUMBER: uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ break; } uv_hub_info->hub_revision = uv_min_hub_revision_id; uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; - uv_cpuid.gpa_shift = 46; /* default unless changed */ + uv_cpuid.gpa_shift = 46; /* Default unless changed */ pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", node_id.s.revision, node_id.s.part_number, node_id.s.node_id, @@ -147,11 +154,12 @@ static int __init early_get_pnodeid(void) return pnode; } -/* [copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ -#define SMT_LEVEL 0 /* leaf 0xb SMT level */ -#define INVALID_TYPE 0 /* leaf 0xb sub-leaf types */ -#define SMT_TYPE 1 -#define CORE_TYPE 2 +/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ + +#define SMT_LEVEL 0 /* Leaf 0xb SMT level */ +#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */ +#define SMT_TYPE 1 +#define CORE_TYPE 2 #define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) @@ -165,11 +173,13 @@ static void set_x2apic_bits(void) pr_info("UV: CPU does not have CPUID.11\n"); return; } + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { pr_info("UV: CPUID.11 not implemented\n"); return; } + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; do { @@ -180,8 +190,9 @@ static void set_x2apic_bits(void) } sub_index++; } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - uv_cpuid.apicid_shift = 0; - uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); uv_cpuid.socketid_shift = sid_shift; } @@ -192,10 +203,8 @@ static void __init early_get_apic_socketid_shift(void) set_x2apic_bits(); - pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", - uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); - pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", - uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); + pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); + pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } /* @@ -208,10 +217,8 @@ static void __init uv_set_apicid_hibit(void) union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; if (is_uv1_hub()) { - apicid_mask.v = - uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); - uv_apicid_hibits = - apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; + apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); + uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; } } @@ -220,20 +227,26 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) int pnodeid; int uv_apic; - if (strncmp(oem_id, "SGI", 3) != 0) + if (strncmp(oem_id, "SGI", 3) != 0) { + if (strncmp(oem_id, "NSGI", 4) == 0) { + uv_hubless_system = true; + pr_info("UV: OEM IDs %s/%s, HUBLESS\n", + oem_id, oem_table_id); + } return 0; + } if (numa_off) { pr_err("UV: NUMA is off, disabling UV support\n"); return 0; } - /* Setup early hub type field in uv_hub_info for Node 0 */ + /* Set up early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; /* * Determine UV arch type. - * SGI: UV100/1000 + * SGI: UV100/1000 * SGI2: UV2000/3000 * SGI3: UV300 (truncated to 4 chars because of different varieties) * SGI4: UV400 (truncated to 4 chars because of different varieties) @@ -249,31 +262,32 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) pnodeid = early_get_pnodeid(); early_get_apic_socketid_shift(); - x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; - if (!strcmp(oem_table_id, "UVX")) { /* most common */ + if (!strcmp(oem_table_id, "UVX")) { + /* This is the most common hardware variant: */ uv_system_type = UV_X2APIC; uv_apic = 0; - } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */ + } else if (!strcmp(oem_table_id, "UVH")) { + /* Only UV1 systems: */ uv_system_type = UV_NON_UNIQUE_APIC; - __this_cpu_write(x2apic_extra_bits, - pnodeid << uvh_apicid.s.pnode_shift); + __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift); uv_set_apicid_hibit(); uv_apic = 1; - } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */ - uv_system_type = UV_LEGACY_APIC; /* very small systems */ + } else if (!strcmp(oem_table_id, "UVL")) { + /* Only used for very small systems: */ + uv_system_type = UV_LEGACY_APIC; uv_apic = 0; } else { goto badbios; } - pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", - oem_id, oem_table_id, uv_system_type, - uv_min_hub_revision_id, uv_apic); + pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); return uv_apic; @@ -294,6 +308,12 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); +int is_uv_hubless(void) +{ + return uv_hubless_system; +} +EXPORT_SYMBOL_GPL(is_uv_hubless); + void **__uv_hub_info_list; EXPORT_SYMBOL_GPL(__uv_hub_info_list); @@ -306,16 +326,18 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* the following values are used for the per node hub info struct */ -static __initdata unsigned short *_node_to_pnode; -static __initdata unsigned short _min_socket, _max_socket; -static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; -static __initdata struct uv_gam_range_entry *uv_gre_table; -static __initdata struct uv_gam_parameters *uv_gp_table; -static __initdata unsigned short *_socket_to_node; -static __initdata unsigned short *_socket_to_pnode; -static __initdata unsigned short *_pnode_to_socket; -static __initdata struct uv_gam_range_s *_gr_table; +/* The following values are used for the per node hub info struct */ +static __initdata unsigned short *_node_to_pnode; +static __initdata unsigned short _min_socket, _max_socket; +static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; +static __initdata struct uv_gam_range_entry *uv_gre_table; +static __initdata struct uv_gam_parameters *uv_gp_table; +static __initdata unsigned short *_socket_to_node; +static __initdata unsigned short *_socket_to_pnode; +static __initdata unsigned short *_pnode_to_socket; + +static __initdata struct uv_gam_range_s *_gr_table; + #define SOCK_EMPTY ((unsigned short)~0) extern int uv_hub_info_version(void) @@ -324,7 +346,7 @@ extern int uv_hub_info_version(void) } EXPORT_SYMBOL(uv_hub_info_version); -/* Build GAM range lookup table */ +/* Build GAM range lookup table: */ static __init void build_uv_gr_table(void) { struct uv_gam_range_entry *gre = uv_gre_table; @@ -342,25 +364,24 @@ static __init void build_uv_gr_table(void) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) { - if (!ram_limit) { /* mark hole between ram/non-ram */ + if (!ram_limit) { + /* Mark hole between RAM/non-RAM: */ ram_limit = last_limit; last_limit = gre->limit; lsid++; continue; } last_limit = gre->limit; - pr_info("UV: extra hole in GAM RE table @%d\n", - (int)(gre - uv_gre_table)); + pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table)); continue; } if (_max_socket < gre->sockid) { - pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", - gre->sockid, _max_socket, - (int)(gre - uv_gre_table)); + pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table)); continue; } sid = gre->sockid - _min_socket; - if (lsid < sid) { /* new range */ + if (lsid < sid) { + /* New range: */ grt = &_gr_table[indx]; grt->base = lindx; grt->nasid = gre->nasid; @@ -369,27 +390,32 @@ static __init void build_uv_gr_table(void) lindx = indx++; continue; } - if (lsid == sid && !ram_limit) { /* update range */ - if (grt->limit == last_limit) { /* .. if contiguous */ + /* Update range: */ + if (lsid == sid && !ram_limit) { + /* .. if contiguous: */ + if (grt->limit == last_limit) { grt->limit = last_limit = gre->limit; continue; } } - if (!ram_limit) { /* non-contiguous ram range */ + /* Non-contiguous RAM range: */ + if (!ram_limit) { grt++; grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; } - grt++; /* non-contiguous/non-ram */ - grt->base = grt - _gr_table; /* base is this entry */ + /* Non-contiguous/non-RAM: */ + grt++; + /* base is this entry */ + grt->base = grt - _gr_table; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid++; } - /* shorten table if possible */ + /* Shorten table if possible */ grt++; i = grt - _gr_table; if (i < _gr_table_len) { @@ -403,16 +429,15 @@ static __init void build_uv_gr_table(void) } } - /* display resultant gam range table */ + /* Display resultant GAM range table: */ for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) { + unsigned long start, end; int gb = grt->base; - unsigned long start = gb < 0 ? 0 : - (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; - unsigned long end = - (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; - pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", - i, grt->nasid, start, end, gb); + start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; + end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; + + pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb); } } @@ -423,16 +448,19 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) pnode = uv_apicid_to_pnode(phys_apicid); phys_apicid |= uv_apicid_hibits; + val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_STARTUP; + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); return 0; @@ -498,27 +526,15 @@ static void uv_init_apic_ldr(void) } static int -uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) +uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, + unsigned int *apicid) { - int unsigned cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } + int ret = default_cpu_mask_to_apicid(mask, irqdata, apicid); - if (likely(cpu < nr_cpu_ids)) { - *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; - return 0; - } + if (!ret) + *apicid |= uv_apicid_hibits; - return -EINVAL; + return ret; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -566,7 +582,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .irq_dest_mode = 0, /* Physical */ .target_cpus = online_target_cpus, .disable_esr = 0, @@ -586,7 +602,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, @@ -627,23 +643,22 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) switch (i) { case 0: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; break; case 1: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; break; case 2: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; break; } alias.v = uv_read_local_mmr(m_overlay); if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); redirect.v = uv_read_local_mmr(m_redirect); - *base = (unsigned long)redirect.s.dest_base - << DEST_SHIFT; + *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; return; } } @@ -652,8 +667,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int pshift, - int bshift, int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; @@ -678,16 +692,19 @@ static __init void map_gru_distributed(unsigned long c) int nid; gru.v = c; - /* only base bits 42:28 relevant in dist mode */ + + /* Only base bits 42:28 relevant in dist mode */ gru_dist_base = gru.v & 0x000007fff0000000UL; if (!gru_dist_base) { pr_info("UV: Map GRU_DIST base address NULL\n"); return; } + bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ + for_each_online_node(nid) { paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | gru_dist_base; @@ -695,11 +712,12 @@ static __init void map_gru_distributed(unsigned long c) gru_first_node_paddr = min(paddr, gru_first_node_paddr); gru_last_node_paddr = max(paddr, gru_last_node_paddr); } + /* Save upper (63:M) bits of address only for is_GRU_range */ gru_first_node_paddr &= gru_dist_umask; gru_last_node_paddr &= gru_dist_umask; - pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", - gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); + + pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); } static __init void map_gru_high(int max_pnode) @@ -719,6 +737,7 @@ static __init void map_gru_high(int max_pnode) map_gru_distributed(gru.v); return; } + base = (gru.v & mask) >> shift; map_high("GRU", base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)base << shift); @@ -772,8 +791,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) id = mmiohs[index].id; overlay.v = uv_read_local_mmr(mmiohs[index].overlay); - pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", - id, overlay.v, overlay.s3.base, overlay.s3.m_io); + + pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io); if (!overlay.s3.enable) { pr_info("UV: %s disabled\n", id); return; @@ -784,7 +803,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) m_io = overlay.s3.m_io; mmr = mmiohs[index].redirect; n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; - min_pnode *= 2; /* convert to NASID */ + /* Convert to NASID: */ + min_pnode *= 2; max_pnode *= 2; max_io = lnasid = fi = li = -1; @@ -793,16 +813,18 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) redirect.v = uv_read_local_mmr(mmr + i * 8); nasid = redirect.s3.nasid; + /* Invalid NASID: */ if (nasid < min_pnode || max_pnode < nasid) - nasid = -1; /* invalid NASID */ + nasid = -1; if (nasid == lnasid) { li = i; - if (i != n-1) /* last entry check */ + /* Last entry check: */ + if (i != n-1) continue; } - /* check if we have a cached (or last) redirect to print */ + /* Check if we have a cached (or last) redirect to print: */ if (lnasid != -1 || (i == n-1 && nasid != -1)) { unsigned long addr1, addr2; int f, l; @@ -814,12 +836,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) f = fi; l = li; } - addr1 = (base << shift) + - f * (1ULL << m_io); - addr2 = (base << shift) + - (l + 1) * (1ULL << m_io); - pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", - id, fi, li, lnasid, addr1, addr2); + addr1 = (base << shift) + f * (1ULL << m_io); + addr2 = (base << shift) + (l + 1) * (1ULL << m_io); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); if (max_io < l) max_io = l; } @@ -827,8 +846,7 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) lnasid = nasid; } - pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", - id, base, shift, m_io, max_io); + pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io); if (max_io >= 0) map_high(id, base, shift, m_io, max_io, map_uc); @@ -841,36 +859,35 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) int shift, enable, m_io, n_io; if (is_uv3_hub() || is_uv4_hub()) { - /* Map both MMIOH Regions */ + /* Map both MMIOH regions: */ map_mmioh_high_uv3(0, min_pnode, max_pnode); map_mmioh_high_uv3(1, min_pnode, max_pnode); return; } if (is_uv1_hub()) { - mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s1.enable; - base = mmioh.s1.base; - m_io = mmioh.s1.m_io; - n_io = mmioh.s1.n_io; + mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s1.enable; + base = mmioh.s1.base; + m_io = mmioh.s1.m_io; + n_io = mmioh.s1.n_io; } else if (is_uv2_hub()) { - mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s2.enable; - base = mmioh.s2.base; - m_io = mmioh.s2.m_io; - n_io = mmioh.s2.n_io; - } else + mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s2.enable; + base = mmioh.s2.base; + m_io = mmioh.s2.m_io; + n_io = mmioh.s2.n_io; + } else { return; + } if (enable) { max_pnode &= (1 << n_io) - 1; - pr_info( - "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", - base, shift, m_io, n_io, max_pnode); + pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode); map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); } else { pr_info("UV: MMIOH disabled\n"); @@ -888,16 +905,16 @@ static __init void uv_rtc_init(void) long status; u64 ticks_per_sec; - status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, - &ticks_per_sec); + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec); + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { - printk(KERN_WARNING - "unable to determine platform RTC clock frequency, " - "guessing.\n"); - /* BIOS gives wrong value for clock freq. so guess */ + pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n"); + + /* BIOS gives wrong value for clock frequency, so guess: */ sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; - } else + } else { sn_rtc_cycles_per_second = ticks_per_sec; + } } /* @@ -908,19 +925,19 @@ static void uv_heartbeat(unsigned long ignored) struct timer_list *timer = &uv_scir_info->timer; unsigned char bits = uv_scir_info->state; - /* flip heartbeat bit */ + /* Flip heartbeat bit: */ bits ^= SCIR_CPU_HEARTBEAT; - /* is this cpu idle? */ + /* Is this CPU idle? */ if (idle_cpu(raw_smp_processor_id())) bits &= ~SCIR_CPU_ACTIVITY; else bits |= SCIR_CPU_ACTIVITY; - /* update system controller interface reg */ + /* Update system controller interface reg: */ uv_set_scir_bits(bits); - /* enable next timer period */ + /* Enable next timer period: */ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } @@ -935,7 +952,7 @@ static int uv_heartbeat_enable(unsigned int cpu) add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; - /* also ensure that boot cpu is enabled */ + /* Also ensure that boot CPU is enabled: */ cpu = 0; } return 0; @@ -968,9 +985,11 @@ static __init int uv_init_heartbeat(void) { int cpu; - if (is_uv_system()) + if (is_uv_system()) { for_each_online_cpu(cpu) uv_heartbeat_enable(cpu); + } + return 0; } @@ -979,14 +998,10 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ /* Direct Legacy VGA I/O traffic to designated IOH */ -int uv_set_vga_state(struct pci_dev *pdev, bool decode, - unsigned int command_bits, u32 flags) +int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags) { int domain, bus, rc; - PR_DEVEL("devfn %x decode %d cmd %x flags %d\n", - pdev->devfn, decode, command_bits, flags); - if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) return 0; @@ -997,13 +1012,12 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, bus = pdev->bus->number; rc = uv_bios_set_legacy_vga_target(decode, domain, bus); - PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); return rc; } /* - * Called on each cpu to initialize the per_cpu UV data area. + * Called on each CPU to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet */ void uv_cpu_init(void) @@ -1030,90 +1044,80 @@ static void get_mn(struct mn *mnp) union uvh_rh_gam_config_mmr_u m_n_config; union uv3h_gr0_gam_gr_config_u m_gr_config; - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); - mnp->n_val = m_n_config.s.n_skt; + /* Make sure the whole structure is well initialized: */ + memset(mnp, 0, sizeof(*mnp)); + + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); + mnp->n_val = m_n_config.s.n_skt; + if (is_uv4_hub()) { - mnp->m_val = 0; - mnp->n_lshift = 0; + mnp->m_val = 0; + mnp->n_lshift = 0; } else if (is_uv3_hub()) { - mnp->m_val = m_n_config.s3.m_skt; - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); - mnp->n_lshift = m_gr_config.s3.m_skt; + mnp->m_val = m_n_config.s3.m_skt; + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + mnp->n_lshift = m_gr_config.s3.m_skt; } else if (is_uv2_hub()) { - mnp->m_val = m_n_config.s2.m_skt; - mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; + mnp->m_val = m_n_config.s2.m_skt; + mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; } else if (is_uv1_hub()) { - mnp->m_val = m_n_config.s1.m_skt; - mnp->n_lshift = mnp->m_val; + mnp->m_val = m_n_config.s1.m_skt; + mnp->n_lshift = mnp->m_val; } mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; } -void __init uv_init_hub_info(struct uv_hub_info_s *hub_info) +void __init uv_init_hub_info(struct uv_hub_info_s *hi) { - struct mn mn = {0}; /* avoid unitialized warnings */ union uvh_node_id_u node_id; + struct mn mn; get_mn(&mn); - hub_info->m_val = mn.m_val; - hub_info->n_val = mn.n_val; - hub_info->m_shift = mn.m_shift; - hub_info->n_lshift = mn.n_lshift ? mn.n_lshift : 0; - - hub_info->hub_revision = uv_hub_info->hub_revision; - hub_info->pnode_mask = uv_cpuid.pnode_mask; - hub_info->min_pnode = _min_pnode; - hub_info->min_socket = _min_socket; - hub_info->pnode_to_socket = _pnode_to_socket; - hub_info->socket_to_node = _socket_to_node; - hub_info->socket_to_pnode = _socket_to_pnode; - hub_info->gr_table_len = _gr_table_len; - hub_info->gr_table = _gr_table; - hub_info->gpa_mask = mn.m_val ? + hi->gpa_mask = mn.m_val ? (1UL << (mn.m_val + mn.n_val)) - 1 : (1UL << uv_cpuid.gpa_shift) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - hub_info->gnode_extra = - (node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1; - - hub_info->gnode_upper = - ((unsigned long)hub_info->gnode_extra << mn.m_val); + hi->m_val = mn.m_val; + hi->n_val = mn.n_val; + hi->m_shift = mn.m_shift; + hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0; + hi->hub_revision = uv_hub_info->hub_revision; + hi->pnode_mask = uv_cpuid.pnode_mask; + hi->min_pnode = _min_pnode; + hi->min_socket = _min_socket; + hi->pnode_to_socket = _pnode_to_socket; + hi->socket_to_node = _socket_to_node; + hi->socket_to_pnode = _socket_to_pnode; + hi->gr_table_len = _gr_table_len; + hi->gr_table = _gr_table; + + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); + hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; + if (mn.m_val) + hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; if (uv_gp_table) { - hub_info->global_mmr_base = uv_gp_table->mmr_base; - hub_info->global_mmr_shift = uv_gp_table->mmr_shift; - hub_info->global_gru_base = uv_gp_table->gru_base; - hub_info->global_gru_shift = uv_gp_table->gru_shift; - hub_info->gpa_shift = uv_gp_table->gpa_shift; - hub_info->gpa_mask = (1UL << hub_info->gpa_shift) - 1; + hi->global_mmr_base = uv_gp_table->mmr_base; + hi->global_mmr_shift = uv_gp_table->mmr_shift; + hi->global_gru_base = uv_gp_table->gru_base; + hi->global_gru_shift = uv_gp_table->gru_shift; + hi->gpa_shift = uv_gp_table->gpa_shift; + hi->gpa_mask = (1UL << hi->gpa_shift) - 1; } else { - hub_info->global_mmr_base = - uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & - ~UV_MMR_ENABLE; - hub_info->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; + hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; } - get_lowmem_redirect( - &hub_info->lowmem_remap_base, &hub_info->lowmem_remap_top); - - hub_info->apic_pnode_shift = uv_cpuid.socketid_shift; + get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top); - /* show system specific info */ - pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", - hub_info->n_val, hub_info->m_val, - hub_info->m_shift, hub_info->n_lshift); + hi->apic_pnode_shift = uv_cpuid.socketid_shift; - pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", - hub_info->gpa_mask, hub_info->gpa_shift, - hub_info->pnode_mask, hub_info->apic_pnode_shift); - - pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", - hub_info->global_mmr_base, hub_info->global_mmr_shift, - hub_info->global_gru_base, hub_info->global_gru_shift); - - pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", - hub_info->gnode_upper, hub_info->gnode_extra); + /* Show system specific info: */ + pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift); + pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift); + pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra); } static void __init decode_gam_params(unsigned long ptr) @@ -1139,12 +1143,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", - "Range", "", "Size", "Type", "NASID", - "SID", "PN"); + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); } - pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", + pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, @@ -1162,29 +1163,32 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) if (pnode_max < gre->pnode) pnode_max = gre->pnode; } - _min_socket = sock_min; - _max_socket = sock_max; - _min_pnode = pnode_min; - _max_pnode = pnode_max; - _gr_table_len = index; - pr_info( - "UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", - index, _min_socket, _max_socket, _min_pnode, _max_pnode); + _min_socket = sock_min; + _max_socket = sock_max; + _min_pnode = pnode_min; + _max_pnode = pnode_max; + _gr_table_len = index; + + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); } -static void __init decode_uv_systab(void) +static int __init decode_uv_systab(void) { struct uv_systab *st; int i; + if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE) + return 0; /* No extended UVsystab required */ + st = uv_systab; - if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub()) - return; - if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) { - pr_crit( - "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", - st->revision, UV_SYSTAB_VERSION_UV4_LATEST); - BUG(); + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { + int rev = st ? st->revision : 0; + + pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err("UV: Cannot support UV operations, switching to generic PC\n"); + uv_system_type = UV_NONE; + + return -EINVAL; } for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { @@ -1205,10 +1209,11 @@ static void __init decode_uv_systab(void) break; } } + return 0; } /* - * Setup physical blade translations from UVH_NODE_PRESENT_TABLE + * Set up physical blade translations from UVH_NODE_PRESENT_TABLE * .. NB: UVH_NODE_PRESENT_TABLE is going away, * .. being replaced by GAM Range Table */ @@ -1244,14 +1249,13 @@ static void __init build_socket_tables(void) if (!gre) { if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { pr_info("UV: No UVsystab socket table, ignoring\n"); - return; /* not required */ + return; } - pr_crit( - "UV: Error: UVsystab address translations not available!\n"); + pr_crit("UV: Error: UVsystab address translations not available!\n"); BUG(); } - /* build socket id -> node id, pnode */ + /* Build socket id -> node id, pnode */ num = maxsock - minsock + 1; bytes = num * sizeof(_socket_to_node[0]); _socket_to_node = kmalloc(bytes, GFP_KERNEL); @@ -1268,27 +1272,27 @@ static void __init build_socket_tables(void) for (i = 0; i < nump; i++) _pnode_to_socket[i] = SOCK_EMPTY; - /* fill in pnode/node/addr conversion list values */ + /* Fill in pnode/node/addr conversion list values: */ pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; i = gre->sockid - minsock; + /* Duplicate: */ if (_socket_to_pnode[i] != SOCK_EMPTY) - continue; /* duplicate */ + continue; _socket_to_pnode[i] = gre->pnode; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; - pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", + pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* Set socket -> node values */ + /* Set socket -> node values: */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1304,7 +1308,7 @@ static void __init build_socket_tables(void) sockid, apicid, nid); } - /* Setup physical blade to pnode translation from GAM Range Table */ + /* Set up physical blade to pnode translation from GAM Range Table: */ bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); _node_to_pnode = kmalloc(bytes, GFP_KERNEL); BUG_ON(!_node_to_pnode); @@ -1314,8 +1318,7 @@ static void __init build_socket_tables(void) for (sockid = minsock; sockid <= maxsock; sockid++) { if (lnid == _socket_to_node[sockid - minsock]) { - _node_to_pnode[lnid] = - _socket_to_pnode[sockid - minsock]; + _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock]; break; } } @@ -1332,8 +1335,7 @@ static void __init build_socket_tables(void) pr_info("UV: Checking socket->node/pnode for identity maps\n"); if (minsock == 0) { for (i = 0; i < num; i++) - if (_socket_to_node[i] == SOCK_EMPTY || - i != _socket_to_node[i]) + if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i]) break; if (i >= num) { kfree(_socket_to_node); @@ -1354,7 +1356,7 @@ static void __init build_socket_tables(void) } } -void __init uv_system_init(void) +static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; int bytes, cpu, nodeid; @@ -1372,8 +1374,13 @@ void __init uv_system_init(void) map_low_mmrs(); - uv_bios_init(); /* get uv_systab for decoding */ - decode_uv_systab(); + /* Get uv_systab for decoding: */ + uv_bios_init(); + + /* If there's an UVsystab problem then abort UV init: */ + if (decode_uv_systab() < 0) + return; + build_socket_tables(); build_uv_gr_table(); uv_init_hub_info(&hub_info); @@ -1381,14 +1388,10 @@ void __init uv_system_init(void) if (!_node_to_pnode) boot_init_possible_blades(&hub_info); - /* uv_num_possible_blades() is really the hub count */ - pr_info("UV: Found %d hubs, %d nodes, %d cpus\n", - uv_num_possible_blades(), - num_possible_nodes(), - num_possible_cpus()); + /* uv_num_possible_blades() is really the hub count: */ + pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus()); - uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, - &sn_region_size, &system_serial_number); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number); hub_info.coherency_domain_number = sn_coherency_id; uv_rtc_init(); @@ -1401,33 +1404,31 @@ void __init uv_system_init(void) struct uv_hub_info_s *new_hub; if (__uv_hub_info_list[nodeid]) { - pr_err("UV: Node %d UV HUB already initialized!?\n", - nodeid); + pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid); BUG(); } /* Allocate new per hub info list */ - new_hub = (nodeid == 0) ? - &uv_hub_info_node0 : - kzalloc_node(bytes, GFP_KERNEL, nodeid); + new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid); BUG_ON(!new_hub); __uv_hub_info_list[nodeid] = new_hub; new_hub = uv_hub_info_list(nodeid); BUG_ON(!new_hub); *new_hub = hub_info; - /* Use information from GAM table if available */ + /* Use information from GAM table if available: */ if (_node_to_pnode) new_hub->pnode = _node_to_pnode[nodeid]; - else /* Fill in during cpu loop */ + else /* Or fill in during CPU loop: */ new_hub->pnode = 0xffff; + new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); new_hub->memory_nid = -1; new_hub->nr_possible_cpus = 0; new_hub->nr_online_cpus = 0; } - /* Initialize per cpu info */ + /* Initialize per CPU info: */ for_each_possible_cpu(cpu) { int apicid = per_cpu(x86_cpu_to_apicid, cpu); int numa_node_id; @@ -1438,22 +1439,24 @@ void __init uv_system_init(void) pnode = uv_apicid_to_pnode(apicid); uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); - uv_cpu_info_per(cpu)->blade_cpu_id = - uv_cpu_hub_info(cpu)->nr_possible_cpus++; + uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; if (uv_cpu_hub_info(cpu)->memory_nid == -1) uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); - if (nodeid != numa_node_id && /* init memoryless node */ + + /* Init memoryless node: */ + if (nodeid != numa_node_id && uv_hub_info_list(numa_node_id)->pnode == 0xffff) uv_hub_info_list(numa_node_id)->pnode = pnode; else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } for_each_node(nodeid) { unsigned short pnode = uv_hub_info_list(nodeid)->pnode; - /* Add pnode info for pre-GAM list nodes without cpus */ + /* Add pnode info for pre-GAM list nodes without CPUs: */ if (pnode == 0xffff) { unsigned long paddr; @@ -1479,15 +1482,30 @@ void __init uv_system_init(void) uv_scir_register_cpu_notifier(); proc_mkdir("sgi_uv", NULL); - /* register Legacy VGA I/O redirection handler */ + /* Register Legacy VGA I/O redirection handler: */ pci_register_set_vga_state(uv_set_vga_state); /* * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as - * EFI is not enabled in the kdump kernel. + * EFI is not enabled in the kdump kernel: */ if (is_kdump_kernel()) reboot_type = BOOT_ACPI; } +/* + * There is a small amount of UV specific code needed to initialize a + * UV system that does not have a "UV HUB" (referred to as "hubless"). + */ +void __init uv_system_init(void) +{ + if (likely(!is_uv_system() && !is_uv_hubless())) + return; + + if (is_uv_system()) + uv_system_init_hub(); + else + uv_nmi_setup_hubless(); +} + apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 45d44c173cf9..446b0d3d4932 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -218,7 +218,8 @@ #include <linux/apm_bios.h> #include <linux/init.h> #include <linux/time.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/cputime.h> #include <linux/pm.h> #include <linux/capability.h> #include <linux/device.h> @@ -608,7 +609,7 @@ static long __apm_bios_call(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -684,7 +685,7 @@ static long __apm_bios_call_simple(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -905,8 +906,8 @@ static int apm_cpu_idle(struct cpuidle_device *dev, { static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ - static unsigned int last_stime; /* = 0 */ - cputime_t stime, utime; + static u64 last_stime; /* = 0 */ + u64 stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; @@ -919,7 +920,7 @@ recalc: } else if (jiffies_since_last_check > idle_period) { unsigned int idle_percentage; - idle_percentage = cputime_to_jiffies(stime - last_stime); + idle_percentage = nsecs_to_jiffies(stime - last_stime); idle_percentage *= 100; idle_percentage /= jiffies_since_last_check; use_apm_idle = (idle_percentage > idle_threshold); @@ -2351,7 +2352,7 @@ static int __init apm_init(void) * Note we only set APM segments on CPU zero, since we pin the APM * code to that CPU. */ - gdt = get_cpu_gdt_table(0); + gdt = get_cpu_gdt_rw(0); set_desc_base(&gdt[APM_CS >> 3], (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); set_desc_base(&gdt[APM_CS_16 >> 3], diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index c62e015b126c..de827d6ac8c2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -81,6 +81,7 @@ void common(void) { BLANK(); OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_secure_boot, boot_params, secure_boot); OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 210927ee2e74..99332f550c48 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -13,6 +13,10 @@ static char syscalls_ia32[] = { #include <asm/syscalls_32.h> }; +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#include <asm/kvm_para.h> +#endif + int main(void) { #ifdef CONFIG_PARAVIRT @@ -22,6 +26,11 @@ int main(void) BLANK(); #endif +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) + OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); + BLANK(); +#endif + #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 52000010c62e..cdf82492b770 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -21,6 +21,7 @@ obj-y += common.o obj-y += rdrand.o obj-y += match.o obj-y += bugs.o +obj-$(CONFIG_CPU_FREQ) += aperfmperf.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1d3167269a67..3b9e220621f8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include <linux/io.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/random.h> #include <asm/processor.h> #include <asm/apic.h> @@ -15,7 +16,7 @@ #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> -# include <asm/cacheflush.h> +# include <asm/set_memory.h> #endif #include "cpu.h" @@ -133,6 +134,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) n = K6_BUG_LOOP; f_vide = vide; + OPTIMIZER_HIDE_VAR(f_vide); d = rdtsc(); while (n--) f_vide(); @@ -309,8 +311,22 @@ static void amd_get_topology(struct cpuinfo_x86 *c) /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 eax, ebx, ecx, edx; - node_id = cpuid_ecx(0x8000001e) & 7; + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + + node_id = ecx & 0xff; + smp_num_siblings = ((ebx >> 8) & 0xff) + 1; + + if (c->x86 == 0x15) + c->cu_id = ebx & 0xff; + + if (c->x86 >= 0x17) { + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + } /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -541,8 +557,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ @@ -786,8 +800,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); - /* AMD CPUs don't reset SS attributes on SYSRET */ - set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ + if (!cpu_has(c, X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c new file mode 100644 index 000000000000..d869c8671e36 --- /dev/null +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -0,0 +1,79 @@ +/* + * x86 APERF/MPERF KHz calculation for + * /sys/.../cpufreq/scaling_cur_freq + * + * Copyright (C) 2017 Intel Corp. + * Author: Len Brown <[email protected]> + * + * This file is licensed under GPLv2. + */ + +#include <linux/jiffies.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/smp.h> + +struct aperfmperf_sample { + unsigned int khz; + unsigned long jiffies; + u64 aperf; + u64 mperf; +}; + +static DEFINE_PER_CPU(struct aperfmperf_sample, samples); + +/* + * aperfmperf_snapshot_khz() + * On the current CPU, snapshot APERF, MPERF, and jiffies + * unless we already did it within 10ms + * calculate kHz, save snapshot + */ +static void aperfmperf_snapshot_khz(void *dummy) +{ + u64 aperf, aperf_delta; + u64 mperf, mperf_delta; + struct aperfmperf_sample *s = this_cpu_ptr(&samples); + + /* Don't bother re-computing within 10 ms */ + if (time_before(jiffies, s->jiffies + HZ/100)) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + aperf_delta = aperf - s->aperf; + mperf_delta = mperf - s->mperf; + + /* + * There is no architectural guarantee that MPERF + * increments faster than we can read it. + */ + if (mperf_delta == 0) + return; + + /* + * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then + * khz = (cpu_khz * aperf_delta) / mperf_delta + */ + if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta) + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + else /* khz = aperf_delta / (mperf_delta / cpu_khz) */ + s->khz = div64_u64(aperf_delta, + div64_u64(mperf_delta, cpu_khz)); + s->jiffies = jiffies; + s->aperf = aperf; + s->mperf = mperf; +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + + return per_cpu(samples.khz, cpu); +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a44ef52184df..0af86d9242da 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,7 +17,7 @@ #include <asm/paravirt.h> #include <asm/alternative.h> #include <asm/pgtable.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> void __init check_bugs(void) { diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 1661d8ec9280..44207b71fee1 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,8 +1,9 @@ -#include <linux/bitops.h> -#include <linux/kernel.h> + +#include <linux/sched.h> +#include <linux/sched/clock.h> #include <asm/cpufeature.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bab7a8a4293..c8b39870f33e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -7,7 +7,9 @@ #include <linux/string.h> #include <linux/ctype.h> #include <linux/delay.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/clock.h> +#include <linux/sched/task.h> #include <linux/init.h> #include <linux/kprobes.h> #include <linux/kgdb.h> @@ -35,6 +37,7 @@ #include <asm/desc.h> #include <asm/fpu/internal.h> #include <asm/mtrr.h> +#include <asm/hwcap2.h> #include <linux/numa.h> #include <asm/asm.h> #include <asm/bugs.h> @@ -51,6 +54,8 @@ #include "cpu.h" +u32 elf_hwcap2 __read_mostly; + /* all of these masks are initialized in setup_cpu_local_masks() */ cpumask_var_t cpu_initialized_mask; cpumask_var_t cpu_callout_mask; @@ -443,19 +448,60 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } +/* Setup the fixmap mapping only once per-processor */ +static inline void setup_fixmap_gdt(int cpu) +{ +#ifdef CONFIG_X86_64 + /* On 64-bit systems, we use a read-only fixmap GDT. */ + pgprot_t prot = PAGE_KERNEL_RO; +#else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the GDT + * is read-only, that will triple fault. + * + * On Xen PV, the GDT must be read-only because the hypervisor requires + * it. + */ + pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; +#endif + + __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); +} + +/* Load the original GDT from the per-cpu structure */ +void load_direct_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_direct_gdt); + +/* Load a fixmap remapping of the per-cpu GDT */ +void load_fixmap_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_ro(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_fixmap_gdt); + /* * Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. */ void switch_to_new_gdt(int cpu) { - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(cpu); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); + /* Load the original GDT */ + load_direct_gdt(cpu); /* Reload the per-cpu base */ - load_percpu_segment(cpu); } @@ -655,6 +701,16 @@ void cpu_detect(struct cpuinfo_x86 *c) } } +static void apply_forced_caps(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -748,6 +804,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); + + /* + * Clear/Set all flags overridden by options, after probe. + * This needs to happen each time we re-probe, which may happen + * several times during CPU initialization. + */ + apply_forced_caps(c); } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -801,14 +864,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof c->x86_capability); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) - identify_cpu_without_cpuid(c); - /* cyrix could have cpuid enabled via c_identify()*/ if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); + setup_force_cpu_cap(X86_FEATURE_CPUID); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -818,6 +879,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); + } else { + identify_cpu_without_cpuid(c); + setup_clear_cpu_cap(X86_FEATURE_CPUID); } setup_force_cpu_cap(X86_FEATURE_ALWAYS); @@ -1015,6 +1079,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; + c->cu_id = 0xff; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1034,10 +1099,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_identify(c); /* Clear/Set all flags overridden by options, after probe */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); @@ -1087,7 +1149,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif - init_hypervisor(c); x86_init_rdrand(c); x86_init_cache_qos(c); setup_pku(c); @@ -1096,10 +1157,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) * Clear/Set all flags overridden by options, need do it * before following smp all cpus cap AND. */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); /* * On SMP, boot_cpu_data holds the common feature set between @@ -1491,7 +1549,7 @@ void cpu_init(void) for (i = 0; i <= IO_BITMAP_LONGS; i++) t->io_bitmap[i] = ~0UL; - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); enter_lazy_tlb(&init_mm, me); @@ -1508,6 +1566,9 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #else @@ -1542,7 +1603,7 @@ void cpu_init(void) /* * Set up and load the per-CPU TSS and LDT */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); enter_lazy_tlb(&init_mm, curr); @@ -1563,6 +1624,9 @@ void cpu_init(void) dbg_restore_debug_regs(); fpu__init_cpu(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index bd9dcd6b712d..6f077445647a 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -9,6 +9,8 @@ #include <asm/pci-direct.h> #include <asm/tsc.h> #include <asm/cpufeature.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> #include "cpu.h" @@ -253,6 +255,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) break; case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ + case 11: /* GX1 with inverted Device ID */ #ifdef CONFIG_PCI { u32 vendor, device; diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 35691a6b0d32..4fa90006ac68 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,8 +28,11 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { -#ifdef CONFIG_XEN - &x86_hyper_xen, +#ifdef CONFIG_XEN_PV + &x86_hyper_xen_pv, +#endif +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, @@ -60,12 +63,6 @@ detect_hypervisor_vendor(void) pr_info("Hypervisor detected: %s\n", x86_hyper->name); } -void init_hypervisor(struct cpuinfo_x86 *c) -{ - if (x86_hyper && x86_hyper->set_cpu_features) - x86_hyper->set_cpu_features(c); -} - void __init init_hypervisor_platform(void) { @@ -74,8 +71,6 @@ void __init init_hypervisor_platform(void) if (!x86_hyper) return; - init_hypervisor(&boot_cpu_data); - if (x86_hyper->init_platform) x86_hyper->init_platform(); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 203f860d2ab3..dfa90a3a5145 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -4,6 +4,7 @@ #include <linux/bitops.h> #include <linux/smp.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> @@ -15,6 +16,8 @@ #include <asm/cpu.h> #include <asm/intel-family.h> #include <asm/microcode_intel.h> +#include <asm/hwcap2.h> +#include <asm/elf.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -62,6 +65,42 @@ void check_mpx_erratum(struct cpuinfo_x86 *c) } } +static bool ring3mwait_disabled __read_mostly; + +static int __init ring3mwait_disable(char *__unused) +{ + ring3mwait_disabled = true; + return 0; +} +__setup("ring3mwait=disable", ring3mwait_disable); + +static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) +{ + /* + * Ring 3 MONITOR/MWAIT feature cannot be detected without + * cpu model and family comparison. + */ + if (c->x86 != 6) + return; + switch (c->x86_model) { + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + break; + default: + return; + } + + if (ring3mwait_disabled) + return; + + set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); + + if (c == &boot_cpu_data) + ELF_HWCAP2 |= HWCAP2_RING3MWAIT; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -119,8 +158,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -447,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c) init_intel_energy_perf(c); } +static void init_cpuid_fault(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); + } +} + +static void init_intel_misc_features(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + return; + + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ + init_cpuid_fault(c); + probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -560,6 +625,8 @@ static void init_intel(struct cpuinfo_x86 *c) detect_vmx_virtcap(c); init_intel_energy_perf(c); + + init_intel_misc_features(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0282b0df004a..c55fb2cb2acc 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -11,6 +11,7 @@ #include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/capability.h> #include <linux/sysfs.h> #include <linux/pci.h> diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5a533fefefa0..5b366462f579 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -32,55 +32,98 @@ #include <asm/intel-family.h> #include <asm/intel_rdt.h> +#define MAX_MBA_BW 100u +#define MBA_IS_LINEAR 0x4 + /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * Used to store the max resource name width and max resource data width + * to display the schemata in a tabular format + */ +int max_name_width, max_data_width; + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); + #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { { - .name = "L3", - .domains = domain_init(RDT_RESOURCE_L3), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3DATA", - .domains = domain_init(RDT_RESOURCE_L3DATA), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 0 + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3CODE", - .domains = domain_init(RDT_RESOURCE_L3CODE), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 1 + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L2", - .domains = domain_init(RDT_RESOURCE_L2), - .msr_base = IA32_L2_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 2, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "MB", + .domains = domain_init(RDT_RESOURCE_MBA), + .msr_base = IA32_MBA_THRTL_BASE, + .msr_update = mba_wrmsr, + .cache_level = 3, + .parse_ctrlval = parse_bw, + .format_str = "%d=%*d", }, }; -static int cbm_idx(struct rdt_resource *r, int closid) +static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) { - return closid * r->cbm_idx_multi + r->cbm_idx_offset; + return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; } /* @@ -118,9 +161,9 @@ static inline bool cache_alloc_hsw_probe(void) return false; r->num_closid = 4; - r->cbm_len = 20; - r->max_cbm = max_cbm; - r->min_cbm_bits = 2; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.min_cbm_bits = 2; r->capable = true; r->enabled = true; @@ -130,16 +173,66 @@ static inline bool cache_alloc_hsw_probe(void) return false; } -static void rdt_get_config(int idx, struct rdt_resource *r) +/* + * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values + * exposed to user interface and the h/w understandable delay values. + * + * The non-linear delay values have the granularity of power of two + * and also the h/w does not guarantee a curve for configured delay + * values vs. actual b/w enforced. + * Hence we need a mapping that is pre calibrated so the user can + * express the memory b/w as a percentage value. + */ +static inline bool rdt_get_mb_table(struct rdt_resource *r) +{ + /* + * There are no Intel SKUs as of now to support non-linear delay. + */ + pr_info("MBA b/w map not implemented for cpu:%d, model:%d", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return false; +} + +static bool rdt_get_mem_config(struct rdt_resource *r) +{ + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->membw.max_delay = eax.split.max_delay + 1; + r->default_ctrl = MAX_MBA_BW; + if (ecx & MBA_IS_LINEAR) { + r->membw.delay_linear = true; + r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; + r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + } else { + if (!rdt_get_mb_table(r)) + return false; + } + r->data_width = 3; + rdt_get_mba_infofile(r); + + r->capable = true; + r->enabled = true; + + return true; +} + +static void rdt_get_cache_config(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; - union cpuid_0x10_1_edx edx; + union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->cbm_len = eax.split.cbm_len + 1; - r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.cbm_len = eax.split.cbm_len + 1; + r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->data_width = (r->cache.cbm_len + 3) / 4; + rdt_get_cache_infofile(r); r->capable = true; r->enabled = true; } @@ -150,8 +243,9 @@ static void rdt_get_cdp_l3_config(int type) struct rdt_resource *r = &rdt_resources_all[type]; r->num_closid = r_l3->num_closid / 2; - r->cbm_len = r_l3->cbm_len; - r->max_cbm = r_l3->max_cbm; + r->cache.cbm_len = r_l3->cache.cbm_len; + r->default_ctrl = r_l3->default_ctrl; + r->data_width = (r->cache.cbm_len + 3) / 4; r->capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter @@ -160,33 +254,6 @@ static void rdt_get_cdp_l3_config(int type) r->enabled = false; } -static inline bool get_rdt_resources(void) -{ - bool ret = false; - - if (cache_alloc_hsw_probe()) - return true; - - if (!boot_cpu_has(X86_FEATURE_RDT_A)) - return false; - - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { - rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); - rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); - } - ret = true; - } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { - /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); - ret = true; - } - - return ret; -} - static int get_cache_id(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); @@ -200,29 +267,55 @@ static int get_cache_id(int cpu, int level) return -1; } -void rdt_cbm_update(void *arg) +/* + * Map the memory b/w percentage value to delay values + * that can be written to QOS_MSRs. + * There are currently no SKUs which support non linear delay values. + */ +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { - struct msr_param *m = (struct msr_param *)arg; + if (r->membw.delay_linear) + return MAX_MBA_BW - bw; + + pr_warn_once("Non Linear delay-bw map not supported but queried\n"); + return r->default_ctrl; +} + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + /* Write the delay values for mba. */ + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); +} + +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); +} + +void rdt_ctrl_update(void *arg) +{ + struct msr_param *m = arg; struct rdt_resource *r = m->res; - int i, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) - goto found; + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + r->msr_update(d, m, r); + return; + } } - pr_info_once("cpu %d not found in any domain for resource %s\n", + pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); - - return; - -found: - for (i = m->low; i < m->high; i++) { - int idx = cbm_idx(r, i); - - wrmsrl(r->msr_base + idx, d->cbm[i]); - } } /* @@ -258,6 +351,32 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +{ + struct msr_param m; + u32 *dc; + int i; + + dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + if (!dc) + return -ENOMEM; + + d->ctrl_val = dc; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100 + */ + for (i = 0; i < r->num_closid; i++, dc++) + *dc = r->default_ctrl; + + m.low = 0; + m.high = r->num_closid; + r->msr_update(d, &m, r); + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -273,7 +392,7 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int i, id = get_cache_id(cpu, r->cache_level); + int id = get_cache_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; @@ -294,22 +413,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d->id = id; - d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); - if (!d->cbm) { + if (domain_setup_ctrlval(r, d)) { kfree(d); return; } - for (i = 0; i < r->num_closid; i++) { - int idx = cbm_idx(r, i); - - d->cbm[i] = r->max_cbm; - wrmsrl(r->msr_base + idx, d->cbm[i]); - } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); - r->num_domains++; } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -325,8 +435,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - r->num_domains--; - kfree(d->cbm); + kfree(d->ctrl_val); list_del(&d->list); kfree(d); } @@ -374,6 +483,57 @@ static int intel_rdt_offline_cpu(unsigned int cpu) return 0; } +/* + * Choose a width for the resource name and resource data based on the + * resource that has widest name and cbm. + */ +static __init void rdt_init_padding(void) +{ + struct rdt_resource *r; + int cl; + + for_each_capable_rdt_resource(r) { + cl = strlen(r->name); + if (cl > max_name_width) + max_name_width = cl; + + if (r->data_width > max_data_width) + max_data_width = r->data_width; + } +} + +static __init bool get_rdt_resources(void) +{ + bool ret = false; + + if (cache_alloc_hsw_probe()) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); + } + ret = true; + } + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + ret = true; + } + + if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) + ret = true; + } + + return ret; +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -382,6 +542,8 @@ static int __init intel_rdt_late_init(void) if (!get_rdt_resources()) return -ENODEV; + rdt_init_padding(); + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rdt/cat:online:", intel_rdt_online_cpu, intel_rdt_offline_cpu); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 8af04afdfcb9..9257bd9dc664 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -25,9 +25,9 @@ #include <linux/sysfs.h> #include <linux/kernfs.h> #include <linux/seq_file.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> #include <linux/slab.h> -#include <linux/cpu.h> #include <linux/task_work.h> #include <uapi/linux/magic.h> @@ -174,6 +174,13 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static bool is_cpu_list(struct kernfs_open_file *of) +{ + struct rftype *rft = of->kn->priv; + + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; +} + static int rdtgroup_cpus_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { @@ -182,10 +189,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); - else + if (rdtgrp) { + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); + } else { ret = -ENOENT; + } rdtgroup_kn_unlock(of->kn); return ret; @@ -252,7 +261,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - ret = cpumask_parse(buf, newmask); + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + if (ret) goto unlock; @@ -473,6 +486,14 @@ static struct rftype rdtgroup_base_files[] = { .seq_show = rdtgroup_cpus_show, }, { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + }, + { .name = "tasks", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -494,32 +515,56 @@ static int rdt_num_closids_show(struct kernfs_open_file *of, struct rdt_resource *r = of->kn->parent->priv; seq_printf(seq, "%d\n", r->num_closid); + return 0; +} +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->default_ctrl); return 0; } -static int rdt_cbm_mask_show(struct kernfs_open_file *of, +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - seq_printf(seq, "%x\n", r->max_cbm); + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + seq_printf(seq, "%u\n", r->membw.min_bw); return 0; } -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, +static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - seq_printf(seq, "%d\n", r->min_cbm_bits); + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", r->membw.delay_linear); return 0; } /* rdtgroup information files for one cache resource. */ -static struct rftype res_info_files[] = { +static struct rftype res_cache_info_files[] = { { .name = "num_closids", .mode = 0444, @@ -530,7 +575,7 @@ static struct rftype res_info_files[] = { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_cbm_mask_show, + .seq_show = rdt_default_ctrl_show, }, { .name = "min_cbm_bits", @@ -540,11 +585,52 @@ static struct rftype res_info_files[] = { }, }; +/* rdtgroup information files for memory bandwidth. */ +static struct rftype res_mba_info_files[] = { + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + }, +}; + +void rdt_get_mba_infofile(struct rdt_resource *r) +{ + r->info_files = res_mba_info_files; + r->nr_info_files = ARRAY_SIZE(res_mba_info_files); +} + +void rdt_get_cache_infofile(struct rdt_resource *r) +{ + r->info_files = res_cache_info_files; + r->nr_info_files = ARRAY_SIZE(res_cache_info_files); +} + static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { struct kernfs_node *kn_subdir; + struct rftype *res_info_files; struct rdt_resource *r; - int ret; + int ret, len; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -563,8 +649,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) ret = rdtgroup_kn_set_ugid(kn_subdir); if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn_subdir, res_info_files, - ARRAY_SIZE(res_info_files)); + + res_info_files = r->info_files; + len = r->nr_info_files; + + ret = rdtgroup_add_files(kn_subdir, res_info_files, len); if (ret) goto out_destroy; kernfs_activate(kn_subdir); @@ -727,7 +816,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { kernfs_unbreak_active_protection(kn); - kernfs_put(kn); + kernfs_put(rdtgrp->kn); kfree(rdtgrp); } else { kernfs_unbreak_active_protection(kn); @@ -767,11 +856,13 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_cdp; + goto out_destroy; static_branch_enable(&rdt_enable_key); goto out; +out_destroy: + kernfs_remove(kn_info); out_cdp: cdp_disable(); out: @@ -780,7 +871,7 @@ out: return dentry; } -static int reset_all_cbms(struct rdt_resource *r) +static int reset_all_ctrls(struct rdt_resource *r) { struct msr_param msr_param; cpumask_var_t cpu_mask; @@ -803,14 +894,14 @@ static int reset_all_cbms(struct rdt_resource *r) cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); for (i = 0; i < r->num_closid; i++) - d->cbm[i] = r->max_cbm; + d->ctrl_val[i] = r->default_ctrl; } cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_cbm_update(&msr_param); + rdt_ctrl_update(&msr_param); /* Update CBM on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); free_cpumask_var(cpu_mask); @@ -896,7 +987,7 @@ static void rdt_kill_sb(struct super_block *sb) /*Put everything back to default values. */ for_each_enabled_rdt_resource(r) - reset_all_cbms(r); + reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); static_branch_disable(&rdt_enable_key); diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c index f369cb8db0d5..406d7a6532f9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -29,26 +29,77 @@ #include <asm/intel_rdt.h> /* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long bw; + int ret; + + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (!r->membw.delay_linear) + return false; + + ret = kstrtoul(buf, 10, &bw); + if (ret) + return false; + + if (bw < r->membw.min_bw || bw > r->default_ctrl) + return false; + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) +{ + unsigned long data; + + if (d->have_new_ctrl) + return -EINVAL; + + if (!bw_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; + + return 0; +} + +/* * Check whether a cache bit mask is valid. The SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. */ -static bool cbm_validate(unsigned long var, struct rdt_resource *r) +static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) { - unsigned long first_bit, zero_bit; + unsigned long first_bit, zero_bit, val; + unsigned int cbm_len = r->cache.cbm_len; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return false; - if (var == 0 || var > r->max_cbm) + if (val == 0 || val > r->default_ctrl) return false; - first_bit = find_first_bit(&var, r->cbm_len); - zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit); + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len) + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) return false; - if ((zero_bit - first_bit) < r->min_cbm_bits) + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) return false; + + *data = val; return true; } @@ -56,17 +107,17 @@ static bool cbm_validate(unsigned long var, struct rdt_resource *r) * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -static int parse_cbm(char *buf, struct rdt_resource *r) +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - int ret; - ret = kstrtoul(buf, 16, &data); - if (ret) - return ret; - if (!cbm_validate(data, r)) + if (d->have_new_ctrl) return -EINVAL; - r->tmp_cbms[r->num_tmp_cbms++] = data; + + if(!cbm_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; return 0; } @@ -74,8 +125,8 @@ static int parse_cbm(char *buf, struct rdt_resource *r) /* * For each domain in this resource we expect to find a series of: * id=mask - * separated by ";". The "id" is in decimal, and must appear in the - * right order. + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. */ static int parse_line(char *line, struct rdt_resource *r) { @@ -83,21 +134,22 @@ static int parse_line(char *line, struct rdt_resource *r) struct rdt_domain *d; unsigned long dom_id; +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) + return -EINVAL; + dom = strim(dom); list_for_each_entry(d, &r->domains, list) { - dom = strsep(&line, ";"); - if (!dom) - return -EINVAL; - id = strsep(&dom, "="); - if (kstrtoul(id, 10, &dom_id) || dom_id != d->id) - return -EINVAL; - if (parse_cbm(dom, r)) - return -EINVAL; + if (d->id == dom_id) { + if (r->parse_ctrlval(dom, r, d)) + return -EINVAL; + goto next; + } } - - /* Any garbage at the end of the line? */ - if (line && line[0]) - return -EINVAL; - return 0; + return -EINVAL; } static int update_domains(struct rdt_resource *r, int closid) @@ -105,7 +157,7 @@ static int update_domains(struct rdt_resource *r, int closid) struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; - int cpu, idx = 0; + int cpu; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -115,30 +167,46 @@ static int update_domains(struct rdt_resource *r, int closid) msr_param.res = r; list_for_each_entry(d, &r->domains, list) { - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - d->cbm[msr_param.low] = r->tmp_cbms[idx++]; + if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->ctrl_val[closid] = d->new_ctrl; + } } + if (cpumask_empty(cpu_mask)) + goto done; cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_cbm_update(&msr_param); + rdt_ctrl_update(&msr_param); /* Update CBM on other cpus. */ - smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); +done: free_cpumask_var(cpu_mask); return 0; } +static int rdtgroup_parse_resource(char *resname, char *tok, int closid) +{ + struct rdt_resource *r; + + for_each_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && closid < r->num_closid) + return parse_line(tok, r); + } + return -EINVAL; +} + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + struct rdt_domain *dom; struct rdt_resource *r; char *tok, *resname; int closid, ret = 0; - u32 *l3_cbms = NULL; /* Valid input requires a trailing newline */ if (nbytes == 0 || buf[nbytes - 1] != '\n') @@ -153,44 +221,20 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - /* get scratch space to save all the masks while we validate input */ for_each_enabled_rdt_resource(r) { - r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), - GFP_KERNEL); - if (!r->tmp_cbms) { - ret = -ENOMEM; - goto out; - } - r->num_tmp_cbms = 0; + list_for_each_entry(dom, &r->domains, list) + dom->have_new_ctrl = false; } while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strsep(&tok, ":"); + resname = strim(strsep(&tok, ":")); if (!tok) { ret = -EINVAL; goto out; } - for_each_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && - closid < r->num_closid) { - ret = parse_line(tok, r); - if (ret) - goto out; - break; - } - } - if (!r->name) { - ret = -EINVAL; - goto out; - } - } - - /* Did the parser find all the masks we need? */ - for_each_enabled_rdt_resource(r) { - if (r->num_tmp_cbms != r->num_domains) { - ret = -EINVAL; + ret = rdtgroup_parse_resource(resname, tok, closid); + if (ret) goto out; - } } for_each_enabled_rdt_resource(r) { @@ -201,10 +245,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdtgroup_kn_unlock(of->kn); - for_each_enabled_rdt_resource(r) { - kfree(r->tmp_cbms); - r->tmp_cbms = NULL; - } return ret ?: nbytes; } @@ -213,11 +253,12 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) struct rdt_domain *dom; bool sep = false; - seq_printf(s, "%s:", r->name); + seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(dom, &r->domains, list) { if (sep) seq_puts(s, ";"); - seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); + seq_printf(s, r->format_str, dom->id, max_data_width, + dom->ctrl_val[closid]); sep = true; } seq_puts(s, "\n"); diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index a3311c886194..43051f0777d4 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o obj-$(CONFIG_ACPI_APEI) += mce-apei.o + +obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c new file mode 100644 index 000000000000..10cec43aac38 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -0,0 +1,430 @@ +/* + * /dev/mcelog driver + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/poll.h> + +#include "mce-internal.h" + +static BLOCKING_NOTIFIER_HEAD(mce_injector_chain); + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +#define mce_log_get_idx_check(p) \ +({ \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious mce_log_get_idx_check() usage"); \ + smp_load_acquire(&(p)); \ +}) + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log_buffer mcelog = { + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), +}; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +/* User mode helper program triggered by machine check event */ +extern char mce_helper[128]; + +static int dev_mce_log(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned int next, entry; + + wmb(); + for (;;) { + entry = mce_log_get_idx_check(mcelog.next); + for (;;) { + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); + return NOTIFY_OK; + } + /* Old left over entry. Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + return NOTIFY_OK; +} + +static struct notifier_block dev_mcelog_nb = { + .notifier_call = dev_mce_log, + .priority = MCE_PRIO_MCELOG, +}; + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + + +void mce_work_trigger(void) +{ + if (mce_helper[0]) + schedule_work(&mce_trigger_work); +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + cpu_tsc[smp_processor_id()] = rdtsc(); +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + next = mce_log_get_idx_check(mcelog.next); + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(m, 0, sizeof(*m)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); + smp_rmb(); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); + } + } + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + kfree(cpu_tsc); + + return err ? err : buf - ubuf; +} + +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (READ_ONCE(mcelog.next)) + return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +void mce_register_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_injector_chain); + +void mce_unregister_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_injector_chain); + +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + return -EINVAL; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + */ + schedule_timeout(2); + + blocking_notifier_call_chain(&mce_injector_chain, 0, &m); + + return usize; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static __init int dev_mcelog_init_device(void) +{ + int err; + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) { + if (err == -EBUSY) + /* Xen dom0 might have registered the device already. */ + pr_info("Unable to init device /dev/mcelog, already registered"); + else + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + + return err; + } + + mce_register_decode_chain(&dev_mcelog_nb); + return 0; +} +device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 83f1a98d37db..2eee85379689 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -52,8 +52,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; - if (severity >= GHES_SEV_PANIC) + + if (severity >= GHES_SEV_PANIC) { m.status |= MCI_STATUS_PCC; + m.tsc = rdtsc(); + } m.addr = mem_err->physical_addr; mce_log(&m); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 93d824ec3120..217cd4449bc9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -72,7 +72,7 @@ struct llist_node *mce_gen_pool_prepare_records(void) return new_head.first; } -void mce_gen_pool_process(void) +void mce_gen_pool_process(struct work_struct *__unused) { struct llist_node *head; struct mce_evt_llist *node, *tmp; @@ -85,7 +85,7 @@ void mce_gen_pool_process(void) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->mce; - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 517619ea6498..231ad23b24a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -10,23 +10,105 @@ * Authors: * Andi Kleen * Ying Huang + * + * The AMD part (from mce_amd_inj.c): a simple MCE injection facility + * for testing different aspects of the RAS code. This driver should be + * built as module so that it can be loaded on production kernels for + * testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010-17: Borislav Petkov <[email protected]> + * Advanced Micro Devices Inc. */ -#include <linux/uaccess.h> -#include <linux/module.h> -#include <linux/timer.h> + +#include <linux/cpu.h> +#include <linux/debugfs.h> #include <linux/kernel.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/preempt.h> -#include <linux/smp.h> +#include <linux/module.h> #include <linux/notifier.h> -#include <linux/kdebug.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/gfp.h> -#include <asm/mce.h> +#include <linux/pci.h> +#include <linux/uaccess.h> + +#include <asm/amd_nb.h> #include <asm/apic.h> +#include <asm/irq_vectors.h> +#include <asm/mce.h> #include <asm/nmi.h> +#include <asm/smp.h> + +#include "mce-internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 3 +#define NBCFG 0x44 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +static void setup_inj_struct(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + + m->cpuvendor = boot_cpu_data.x86_vendor; +} /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -143,7 +225,7 @@ static int raise_local(void) return ret; } -static void raise_mce(struct mce *m) +static void __maybe_unused raise_mce(struct mce *m) { int context = MCJ_CTX(m->inject_flags); @@ -152,7 +234,6 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; @@ -192,64 +273,461 @@ static void raise_mce(struct mce *m) raise_local(); put_cpu(); put_online_cpus(); - } else -#endif - { + } else { preempt_disable(); raise_local(); preempt_enable(); } } -/* Error injection interface */ -static ssize_t mce_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) +static int mce_inject_raise(struct notifier_block *nb, unsigned long val, + void *data) { - struct mce m; + struct mce *m = (struct mce *)data; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* - * There are some cases where real MSR reads could slip - * through. - */ - if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) - return -EIO; + if (!m) + return NOTIFY_DONE; + + mutex_lock(&mce_inject_mutex); + raise_mce(m); + mutex_unlock(&mce_inject_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block inject_nb = { + .notifier_call = mce_inject_raise, +}; + +/* + * Caller needs to be make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. + */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? (l |= BIT(18)) : (l &= ~BIT(18)); - if ((unsigned long)usize > sizeof(struct mce)) - usize = sizeof(struct mce); - if (copy_from_user(&m, ubuf, usize)) + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); + + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + + if (cnt > MAX_FLAG_OPT_SIZE) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; - if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? + */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct amd_northbridge *nb; + struct pci_dev *F3; + u32 val; + int err; + + nb = node_to_amd_nb(nid); + if (!nb) + return; + + F3 = nb->misc; + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + +static void prepare_msrs(void *info) +{ + struct mce m = *(struct mce *)info; + u8 b = m.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + } +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + rdtscll(i_mce.tsc); + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + + if (inj_type == SW_INJ) { + mce_inject_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; /* - * Need to give user space some time to set everything up, - * so do it a jiffie or two later everywhere. + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC */ - schedule_timeout(2); + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + } - mutex_lock(&mce_inject_mutex); - raise_mce(&m); - mutex_unlock(&mce_inject_mutex); - return usize; + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); + + toggle_hw_mce_inject(cpu, false); + + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. + */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + struct dentry *d; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static int __init debugfs_init(void) +{ + unsigned int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + if (!dfs_inj) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { + dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, + dfs_fls[i].perm, + dfs_inj, + &i_mce, + dfs_fls[i].fops); + + if (!dfs_fls[i].d) + goto err_dfs_add; + } + + return 0; + +err_dfs_add: + while (i-- > 0) + debugfs_remove(dfs_fls[i].d); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; + + return -ENODEV; } -static int inject_init(void) +static int __init inject_init(void) { + int err; + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + + err = debugfs_init(); + if (err) { + free_cpumask_var(mce_inject_cpumask); + return err; + } + + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); + mce_register_injector_chain(&inject_nb); + + setup_inj_struct(&i_mce); + pr_info("Machine check injector initialized\n"); - register_mce_write_callback(mce_write); - register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, - "mce_notify"); + return 0; } +static void __exit inject_exit(void) +{ + + mce_unregister_injector_chain(&inject_nb); + unregister_nmi_handler(NMI_LOCAL, "mce_notify"); + + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + free_cpumask_var(mce_inject_cpumask); +} + module_init(inject_init); -/* - * Cannot tolerate unloading currently because we cannot - * guarantee all openers of mce_chrdev will get a reference to us. - */ +module_exit(inject_exit); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index cd74a3f00aea..098530a93bb7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,7 +13,7 @@ enum severity_level { MCE_PANIC_SEVERITY, }; -extern struct atomic_notifier_head x86_mce_decoder_chain; +extern struct blocking_notifier_head x86_mce_decoder_chain; #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -31,7 +31,7 @@ struct mce_evt_llist { struct mce mce; }; -void mce_gen_pool_process(void); +void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); @@ -96,3 +96,15 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) m1->addr != m2->addr || m1->misc != m2->misc; } + +extern struct device_attribute dev_attr_trigger; + +#ifdef CONFIG_X86_MCELOG_LEGACY +void mce_work_trigger(void); +void mce_register_injector_chain(struct notifier_block *nb); +void mce_unregister_injector_chain(struct notifier_block *nb); +#else +static inline void mce_work_trigger(void) { } +static inline void mce_register_injector_chain(struct notifier_block *nb) { } +static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 537c6647d84c..6dde0497efc7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -35,6 +35,7 @@ #include <linux/poll.h> #include <linux/nmi.h> #include <linux/cpu.h> +#include <linux/ras.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/mm.h> @@ -49,18 +50,11 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/reboot.h> #include "mce-internal.h" -static DEFINE_MUTEX(mce_chrdev_read_mutex); - -#define mce_log_get_idx_check(p) \ -({ \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ - !lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious mce_log_get_idx_check() usage"); \ - smp_load_acquire(&(p)); \ -}) +static DEFINE_MUTEX(mce_log_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -85,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -/* User mode helper program triggered by machine check event */ -static unsigned long mce_need_notify; -static char mce_helper[128]; -static char *mce_helper_argv[2] = { mce_helper, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); - static DEFINE_PER_CPU(struct mce, mces_seen); -static int cpu_missing; +static unsigned long mce_need_notify; +static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -121,14 +109,13 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -144,84 +131,38 @@ void mce_setup(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; - -void mce_log(struct mce *mce) +void mce_log(struct mce *m) { - unsigned next, entry; - - /* Emit the trace record: */ - trace_mce_record(mce); - - if (!mce_gen_pool_add(mce)) + if (!mce_gen_pool_add(m)) irq_work_queue(&mce_irq_work); - - wmb(); - for (;;) { - entry = mce_log_get_idx_check(mcelog.next); - for (;;) { - - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, - (unsigned long *)&mcelog.flags); - return; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, &mce_need_notify); } void mce_inject_log(struct mce *m) { - mutex_lock(&mce_chrdev_read_mutex); + mutex_lock(&mce_log_mutex); mce_log(m); - mutex_unlock(&mce_chrdev_read_mutex); + mutex_unlock(&mce_log_mutex); } EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +/* + * We run the default notifier if we have only the SRAO, the first and the + * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS + * notifiers registered on the chain. + */ +#define NUM_DEFAULT_NOTIFIERS 3 static atomic_t num_notifiers; void mce_register_decode_chain(struct notifier_block *nb) { - atomic_inc(&num_notifiers); + if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + return; - /* Ensure SRAO notifier has the highest priority in the decode chain. */ - if (nb != &mce_srao_nb && nb->priority == INT_MAX) - nb->priority -= 1; + atomic_inc(&num_notifiers); - atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -229,7 +170,7 @@ void mce_unregister_decode_chain(struct notifier_block *nb) { atomic_dec(&num_notifiers); - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -322,18 +263,7 @@ static void __print_mce(struct mce *m) static void print_mce(struct mce *m) { - int ret = 0; - __print_mce(m); - - /* - * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that) - */ - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - if (ret == NOTIFY_STOP) - return; - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } @@ -522,7 +452,6 @@ static void mce_schedule_work(void) static void mce_irq_work_cb(struct irq_work *entry) { - mce_notify_irq(); mce_schedule_work(); } @@ -551,20 +480,96 @@ static void mce_report_event(struct pt_regs *regs) */ static int mce_usable_address(struct mce *m) { - if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + if (!(m->status & MCI_STATUS_ADDRV)) return 0; /* Checks after this one are Intel-specific: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 1; + if (!(m->status & MCI_STATUS_MISCV)) + return 0; + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; + return 1; } +bool mce_is_memory_error(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) { + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + return (xec == 0x0 || xec == 0x8); + } else if (m->cpuvendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} +EXPORT_SYMBOL_GPL(mce_is_memory_error); + +static bool cec_add_mce(struct mce *m) +{ + if (!m) + return false; + + /* We eat only correctable DRAM errors with usable addresses. */ + if (mce_is_memory_error(m) && + !(m->status & MCI_STATUS_UC) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return true; + + return false; +} + +static int mce_first_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (cec_add_mce(m)) + return NOTIFY_STOP; + + /* Emit the trace record: */ + trace_mce_record(m); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block first_nb = { + .notifier_call = mce_first_notifier, + .priority = MCE_PRIO_FIRST, +}; + static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -583,7 +588,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, } static struct notifier_block mce_srao_nb = { .notifier_call = srao_decode_notifier, - .priority = INT_MAX, + .priority = MCE_PRIO_SRAO, }; static int mce_default_notifier(struct notifier_block *nb, unsigned long val, @@ -594,11 +599,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - /* - * Run the default notifier if we have only the SRAO - * notifier and us registered. - */ - if (atomic_read(&num_notifiers) > 2) + if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) return NOTIFY_DONE; __print_mce(m); @@ -609,7 +610,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, static struct notifier_block mce_default_nb = { .notifier_call = mce_default_notifier, /* lowest prio, we want it to run last. */ - .priority = 0, + .priority = MCE_PRIO_LOWEST, }; /* @@ -651,37 +652,6 @@ static void mce_read_aux(struct mce *m, int i) } } -static bool memory_error(struct mce *m) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; - - return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * Intel SDM Volume 3B - 15.9.2 Compound Error Codes - * - * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for - * indicating a memory error. Bit 8 is used for indicating a - * cache hierarchy error. The combination of bit 2 and bit 3 - * is used for indicating a `generic' cache hierarchy error - * But we can't just blindly check the above bits, because if - * bit 11 is set, then it is a bus/interconnect error - and - * either way the above bits just gives more detail on what - * bus/interconnect error happened. Note that bit 12 can be - * ignored, as it's the "filter" bit. - */ - return (m->status & 0xef80) == BIT(7) || - (m->status & 0xef00) == BIT(8) || - (m->status & 0xeffc) == 0xc; - } - - return false; -} - DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -703,21 +673,14 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { bool error_seen = false; struct mce m; - int severity; int i; this_cpu_inc(mce_poll_count); mce_gather_info(&m, NULL); - /* - * m.tsc was set in mce_setup(). Clear it if not requested. - * - * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid - * that dance. - */ - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; + if (flags & MCP_TIMESTAMP) + m.tsc = rdtsc(); for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) @@ -746,11 +709,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_read_aux(&m, i); - severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - - if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) - if (m.status & MCI_STATUS_ADDRV) - m.severity = severity; + m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); /* * Don't get the IP here because it's unlikely to @@ -1136,9 +1095,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) * on Intel. */ int lmce = 1; + int cpu = smp_processor_id(); - /* If this CPU is offline, just bail out. */ - if (cpu_is_offline(smp_processor_id())) { + /* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); @@ -1156,6 +1128,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) goto out; mce_gather_info(&m, regs); + m.tsc = rdtsc(); final = this_cpu_ptr(&mces_seen); *final = m; @@ -1322,41 +1295,6 @@ int memory_failure(unsigned long pfn, int vector, int flags) #endif /* - * Action optional processing happens here (picking up - * from the list of faulting pages that do_machine_check() - * placed into the genpool). - */ -static void mce_process_work(struct work_struct *dummy) -{ - mce_gen_pool_process(); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ - struct mce m; - - mce_setup(&m); - m.bank = MCE_THERMAL_BANK; - m.status = status; - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* * Periodic polling timer for "silent" machine check errors. If the * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). @@ -1442,13 +1380,6 @@ static void mce_timer_delete_all(void) del_timer_sync(&per_cpu(mce_timer, cpu)); } -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - /* * Notify the user(s) about new machine check events. * Can be called from interrupt context, but not from machine check/NMI @@ -1460,11 +1391,7 @@ int mce_notify_irq(void) static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); if (test_and_clear_bit(0, &mce_need_notify)) { - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - - if (mce_helper[0]) - schedule_work(&mce_trigger_work); + mce_work_trigger(); if (__ratelimit(&ratelimit)) pr_info(HW_ERR "Machine check events logged\n"); @@ -1618,7 +1545,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 < 17 && cfg->bootlog < 0) { + if (c->x86 < 0x11 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: @@ -1731,30 +1658,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return 0; } -static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +/* + * Init basic CPU features needed for early decoding of MCEs. + */ +static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) { - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; - break; - - case X86_VENDOR_AMD: { + if (c->x86_vendor == X86_VENDOR_AMD) { mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - /* - * Install proper ops for Scalable MCA enabled processors - */ if (mce_flags.smca) { msr_ops.ctl = smca_ctl_reg; msr_ops.status = smca_status_reg; msr_ops.addr = smca_addr_reg; msr_ops.misc = smca_misc_reg; } - mce_amd_feature_init(c); + } +} +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = cmci_intel_adjust_timer; + break; + + case X86_VENDOR_AMD: { + mce_amd_feature_init(c); break; } @@ -1841,6 +1773,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) machine_check_vector = do_machine_check; + __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); @@ -1866,252 +1799,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c) } -/* - * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_chrdev_state_lock); -static int mce_chrdev_open_count; /* #times opened */ -static int mce_chrdev_open_exclu; /* already open exclusive? */ - -static int mce_chrdev_open(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - if (mce_chrdev_open_exclu || - (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_chrdev_state_lock); - - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - mce_chrdev_open_exclu = 1; - mce_chrdev_open_count++; - - spin_unlock(&mce_chrdev_state_lock); - - return nonseekable_open(inode, file); -} - -static int mce_chrdev_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - mce_chrdev_open_count--; - mce_chrdev_open_exclu = 0; - - spin_unlock(&mce_chrdev_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - cpu_tsc[smp_processor_id()] = rdtsc(); -} - -static int mce_apei_read_done; - -/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ -static int __mce_read_apei(char __user **ubuf, size_t usize) -{ - int rc; - u64 record_id; - struct mce m; - - if (usize < sizeof(struct mce)) - return -EINVAL; - - rc = apei_read_mce(&m, &record_id); - /* Error or no more MCE record */ - if (rc <= 0) { - mce_apei_read_done = 1; - /* - * When ERST is disabled, mce_chrdev_read() should return - * "no record" instead of "no device." - */ - if (rc == -ENODEV) - return 0; - return rc; - } - rc = -EFAULT; - if (copy_to_user(*ubuf, &m, sizeof(struct mce))) - return rc; - /* - * In fact, we should have cleared the record after that has - * been flushed to the disk or sent to network in - * /sbin/mcelog, but we have no interface to support that now, - * so just clear it to avoid duplication. - */ - rc = apei_clear_mce(record_id); - if (rc) { - mce_apei_read_done = 1; - return rc; - } - *ubuf += sizeof(struct mce); - - return 0; -} - -static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, - size_t usize, loff_t *off) -{ - char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; - int i, err; - - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - mutex_lock(&mce_chrdev_read_mutex); - - if (!mce_apei_read_done) { - err = __mce_read_apei(&buf, usize); - if (err || buf != ubuf) - goto out; - } - - next = mce_log_get_idx_check(mcelog.next); - - /* Only supports full reads right now */ - err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) - goto out; - - err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - struct mce *m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(m, 0, sizeof(*m)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, m, sizeof(*m)); - buf += sizeof(*m); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - - if (m->finished && m->tsc < cpu_tsc[m->cpu]) { - err |= copy_to_user(buf, m, sizeof(*m)); - smp_rmb(); - buf += sizeof(*m); - memset(m, 0, sizeof(*m)); - } - } - - if (err) - err = -EFAULT; - -out: - mutex_unlock(&mce_chrdev_read_mutex); - kfree(cpu_tsc); - - return err ? err : buf - ubuf; -} - -static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) - return POLLIN | POLLRDNORM; - if (!mce_apei_read_done && apei_check_mce()) - return POLLIN | POLLRDNORM; - return 0; -} - -static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, - unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off); - -void register_mce_write_callback(ssize_t (*fn)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)) -{ - mce_write = fn; -} -EXPORT_SYMBOL_GPL(register_mce_write_callback); - -static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) -{ - if (mce_write) - return mce_write(filp, ubuf, usize, off); - else - return -EINVAL; -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_chrdev_open, - .release = mce_chrdev_release, - .read = mce_chrdev_read, - .write = mce_chrdev_write, - .poll = mce_chrdev_poll, - .unlocked_ioctl = mce_chrdev_ioctl, - .llseek = no_llseek, -}; - -static struct miscdevice mce_chrdev_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - static void __mce_disable_bank(void *arg) { int bank = *((int *)arg); @@ -2140,7 +1827,8 @@ void mce_disable_bank(int bank) * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine * check, or 0 to not wait - * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h + and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold * mce=recovery force enable memcpy_mcsafe() @@ -2185,11 +1873,12 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&first_nb); mce_register_decode_chain(&mce_srao_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); - INIT_WORK(&mce_work, mce_process_work); + INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); return 0; @@ -2219,12 +1908,13 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel CPUs. Some of these MSRs are socket-wide. + * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide. * Disabling them for just a single offlined CPU is bad, since it will * inhibit reporting for all shared resources on the socket like the * last level cache (LLC), the integrated memory controller (iMC), etc. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || + boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; mce_disable_error_reporting(); @@ -2329,29 +2019,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return size; } -static ssize_t -show_trigger(struct device *s, struct device_attribute *attr, char *buf) -{ - strcpy(buf, mce_helper); - strcat(buf, "\n"); - return strlen(mce_helper) + 1; -} - -static ssize_t set_trigger(struct device *s, struct device_attribute *attr, - const char *buf, size_t siz) -{ - char *p; - - strncpy(mce_helper, buf, sizeof(mce_helper)); - mce_helper[sizeof(mce_helper)-1] = 0; - p = strchr(mce_helper, '\n'); - - if (p) - *p = 0; - - return strlen(mce_helper) + !!p; -} - static ssize_t set_ignore_ce(struct device *s, struct device_attribute *attr, const char *buf, size_t size) @@ -2408,7 +2075,6 @@ static ssize_t store_int_with_restart(struct device *s, return ret; } -static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); @@ -2431,7 +2097,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, +#ifdef CONFIG_X86_MCELOG_LEGACY &dev_attr_trigger, +#endif &dev_attr_monarch_timeout.attr, &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, @@ -2605,7 +2273,6 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { - enum cpuhp_state hp_online; int err; if (!mce_available(&boot_cpu_data)) { @@ -2633,21 +2300,11 @@ static __init int mcheck_init_device(void) mce_cpu_online, mce_cpu_pre_down); if (err < 0) goto err_out_online; - hp_online = err; register_syscore_ops(&mce_syscore_ops); - /* register character device /dev/mcelog */ - err = misc_register(&mce_chrdev_device); - if (err) - goto err_register; - return 0; -err_register: - unregister_syscore_ops(&mce_syscore_ops); - cpuhp_remove_state(hp_online); - err_out_online: cpuhp_remove_state(CPUHP_X86_MCE_DEAD); @@ -2655,7 +2312,7 @@ err_out_mem: free_cpumask_var(mce_device_initialized); err_out: - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + pr_err("Unable to init MCE device (rc: %d)\n", err); return err; } @@ -2734,6 +2391,7 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); + cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a5fd137417a2..9e314bcf67cc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -60,7 +60,7 @@ static const char * const th_names[] = { "load_store", "insn_fetch", "combined_unit", - "", + "decode_unit", "northbridge", "execution_unit", }; @@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void get_smca_bank_info(unsigned int bank) +static void smca_configure(unsigned int bank, unsigned int cpu) { - unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + unsigned int i, hwid_mcatype; struct smca_hwid *s_hwid; - u32 high, instance_id; + u32 high, low; + u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + /* Set appropriate bits in MCA_CONFIG */ + if (!rdmsr_safe(smca_config, &low, &high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + high |= BIT(0); + + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((low & BIT(5)) && !((high >> 5) & 0x3)) + high |= BIT(5); + + wrmsr(smca_config, low, high); + } /* Collect bank_info using CPU 0 for now. */ if (cpu) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } @@ -191,7 +222,8 @@ static void get_smca_bank_info(unsigned int bank) smca_get_name(s_hwid->bank_type)); smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = instance_id; + smca_banks[bank].id = low; + smca_banks[bank].sysfs_id = s_hwid->count++; break; } } @@ -432,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high, smca_addr; + u32 smca_low, smca_high; struct threshold_block b; int new; @@ -456,51 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, goto set_offset; } - smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); - - if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { - /* - * OS is required to set the MCAX bit to acknowledge that it is - * now using the new MSR ranges and new registers under each - * bank. It also means that the OS will configure deferred - * errors in the new MCx_CONFIG register. If the bit is not set, - * uncorrectable errors will cause a system panic. - * - * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) - */ - smca_high |= BIT(0); - - /* - * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} - * registers with the option of additionally logging to - * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. - * - * This bit is usually set by BIOS to retain the old behavior - * for OSes that don't use the new registers. Linux supports the - * new registers so let's disable that additional logging here. - * - * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high - * portion of the MSR). - */ - smca_high &= ~BIT(2); - - /* - * SMCA sets the Deferred Error Interrupt type per bank. - * - * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us - * if the DeferredIntType bit field is available. - * - * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the - * high portion of the MSR). OS should set this to 0x1 to enable - * APIC based interrupt. First, check that no interrupt has been - * set. - */ - if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) - smca_high |= BIT(5); - - wrmsr(smca_addr, smca_low, smca_high); - } - /* Gather LVT offset for thresholding: */ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) goto out; @@ -529,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) - get_smca_bank_info(bank); + smca_configure(bank, cpu); for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(cpu, address, low, high, bank, block); @@ -754,36 +741,19 @@ out_err: } EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); -static void -__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) +static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - u32 msr_status = msr_ops.status(bank); - u32 msr_addr = msr_ops.addr(bank); struct mce m; - u64 status; - - WARN_ON_ONCE(deferred_err && threshold_err); - - if (deferred_err && mce_flags.smca) { - msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); - msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); - } - - rdmsrl(msr_status, status); - - if (!(status & MCI_STATUS_VAL)) - return; mce_setup(&m); m.status = status; - m.bank = bank; - - if (threshold_err) - m.misc = misc; + m.misc = misc; + m.bank = bank; + m.tsc = rdtsc(); if (m.status & MCI_STATUS_ADDRV) { - rdmsrl(msr_addr, m.addr); + m.addr = addr; /* * Extract [55:<lsb>] where lsb is the least significant @@ -804,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) } mce_log(&m); - - wrmsrl(msr_status, 0); } static inline void __smp_deferred_error_interrupt(void) @@ -814,14 +782,14 @@ static inline void __smp_deferred_error_interrupt(void) deferred_error_int_vector(); } -asmlinkage __visible void smp_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); __smp_deferred_error_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) { entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); @@ -830,87 +798,126 @@ asmlinkage __visible void smp_trace_deferred_error_interrupt(void) exiting_ack_irq(); } -/* APIC interrupt handler for deferred errors */ -static void amd_deferred_error_interrupt(void) +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { - unsigned int bank; - u32 msr_status; - u64 status; + u64 status, addr = 0; - for (bank = 0; bank < mca_cfg.banks; ++bank) { - msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank) - : msr_ops.status(bank); + rdmsrl(msr_stat, status); + if (!(status & MCI_STATUS_VAL)) + return false; - rdmsrl(msr_status, status); + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); - if (!(status & MCI_STATUS_VAL) || - !(status & MCI_STATUS_DEFERRED)) - continue; + __log_error(bank, status, addr, misc); - __log_error(bank, true, false, 0); - break; - } + wrmsrl(msr_stat, 0); + + return status & MCI_STATUS_DEFERRED; } /* - * APIC Interrupt Handler + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. */ +static void log_error_deferred(unsigned int bank) +{ + bool defrd; -/* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. - */ + defrd = _log_error_bank(bank, msr_ops.status(bank), + msr_ops.addr(bank), 0); -static void amd_threshold_interrupt(void) + if (!mce_flags.smca) + return; + + /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ + if (defrd) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return; + } + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) { - u32 low = 0, high = 0, address = 0; - unsigned int bank, block, cpu = smp_processor_id(); - struct thresh_restart tr; + unsigned int bank; - /* assume first bank caused it */ - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(cpu, address, low, high, bank, block); - if (!address) - break; + for (bank = 0; bank < mca_cfg.banks; ++bank) + log_error_deferred(bank); +} - if (rdmsr_safe(address, &low, &high)) - break; +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); +} - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } +static void log_and_reset_block(struct threshold_block *block) +{ + struct thresh_restart tr; + u32 low = 0, high = 0; - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; + if (!block) + return; - /* - * Log the machine check that caused the threshold - * event. - */ - if (high & MASK_OVERFLOW_HI) - goto log; - } - } - return; + if (rdmsr_safe(block->address, &low, &high)) + return; + + if (!(high & MASK_OVERFLOW_HI)) + return; -log: - __log_error(bank, false, true, ((u64)high << 32) | low); + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(block->bank, ((u64)high << 32) | low); /* Reset threshold block after logging error. */ memset(&tr, 0, sizeof(tr)); - tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + tr.b = block; threshold_restart_bank(&tr); } /* + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt + * goes off when error_count reaches threshold_limit. + */ +static void amd_threshold_interrupt(void) +{ + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + unsigned int bank, cpu = smp_processor_id(); + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + + first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; + if (!first_block) + continue; + + /* + * The first block is also the head of the list. Check it first + * before iterating over the rest. + */ + log_and_reset_block(first_block); + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + log_and_reset_block(block); + } +} + +/* * Sysfs Interface */ @@ -1064,9 +1071,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } + if (smca_banks[bank].hwid->count == 1) + return smca_get_name(bank_type); + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "%s_%x", smca_get_name(bank_type), - smca_banks[bank].id); + smca_banks[bank].sysfs_id); return buf_mcatype; } @@ -1197,7 +1207,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; per_cpu(threshold_banks, cpu)[bank] = b; - atomic_inc(&b->cpus); + refcount_inc(&b->cpus); err = __threshold_add_blocks(b); @@ -1220,7 +1230,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; if (is_shared_bank(bank)) { - atomic_set(&b->cpus, 1); + refcount_set(&b->cpus, 1); /* nb is already initialized, see above */ if (nb) { @@ -1284,7 +1294,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) goto free_out; if (is_shared_bank(bank)) { - if (!atomic_dec_and_test(&b->cpus)) { + if (!refcount_dec_and_test(&b->cpus)) { __threshold_remove_blocks(b); per_cpu(threshold_banks, cpu)[bank] = NULL; return; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 190b3e6cef4d..e84db79ef272 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_XEON_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) return; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 465aca8be009..d7cc190ae457 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -6,7 +6,7 @@ * * Maintains a counter in /sys that keeps track of the number of thermal * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog and mcelog is rate limited). + * (since the logging to syslog is rate limited). * * Author: Dmitriy Zavin ([email protected]) * @@ -141,13 +141,8 @@ static struct attribute_group thermal_attr_group = { * IRQ has been acknowledged. * * It will take care of rate limiting and printing messages to the syslog. - * - * Returns: 0 : Event should NOT be further logged, i.e. still in - * "timeout" from previous log message. - * 1 : Event should be logged further, and a message has been - * printed to the syslog. */ -static int therm_throt_process(bool new_event, int event, int level) +static void therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; unsigned int this_cpu = smp_processor_id(); @@ -162,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level) else if (event == POWER_LIMIT_EVENT) state = &pstate->core_power_limit; else - return 0; + return; } else if (level == PACKAGE_LEVEL) { if (event == THERMAL_THROTTLING_EVENT) state = &pstate->package_throttle; else if (event == POWER_LIMIT_EVENT) state = &pstate->package_power_limit; else - return 0; + return; } else - return 0; + return; old_event = state->new_event; state->new_event = new_event; @@ -181,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level) if (time_before64(now, state->next_check) && state->count != state->last_count) - return 0; + return; state->next_check = now + CHECK_INTERVAL; state->last_count = state->count; @@ -193,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - return 1; + return; } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); - return 1; + return; } - - return 0; } static int thresh_event_valid(int level, int event) @@ -365,10 +358,9 @@ static void intel_thermal_interrupt(void) /* Check for violation of core thermal thresholds*/ notify_thresholds(msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, @@ -404,14 +396,16 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 9beb092d68a5..bb0e75eed10a 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -23,14 +23,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage __visible void smp_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 079e81733a58..21b185793c80 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -10,7 +10,7 @@ * Author: Peter Oruba <[email protected]> * * Based on work by: - * Tigran Aivazian <[email protected]> + * Tigran Aivazian <[email protected]> * * early loader: * Copyright (C) 2013 Advanced Micro Devices, Inc. @@ -42,16 +42,19 @@ static struct equiv_cpu_entry *equiv_cpu_table; /* * This points to the current valid container of microcode patches which we will - * save from the initrd/builtin before jettisoning its contents. + * save from the initrd/builtin before jettisoning its contents. @mc is the + * microcode patch we found to match. */ -struct container { - u8 *data; - size_t size; -} cont; +struct cont_desc { + struct microcode_amd *mc; + u32 cpuid_1_eax; + u32 psize; + u8 *data; + size_t size; +}; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; -static u16 this_equiv_id; /* * Microcode patch container file is prepended to the initrd in cpio @@ -60,57 +63,13 @@ static u16 this_equiv_id; static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; -static size_t compute_container_size(u8 *data, u32 total_size) +static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig) { - size_t size = 0; - u32 *header = (u32 *)data; - - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return size; - - size = header[2] + CONTAINER_HDR_SZ; - total_size -= size; - data += size; - - while (total_size) { - u16 patch_size; - - header = (u32 *)data; - - if (header[0] != UCODE_UCODE_TYPE) - break; - - /* - * Sanity-check patch size. - */ - patch_size = header[1]; - if (patch_size > PATCH_MAX_SIZE) - break; - - size += patch_size + SECTION_HDR_SIZE; - data += patch_size + SECTION_HDR_SIZE; - total_size -= patch_size + SECTION_HDR_SIZE; + for (; equiv_table && equiv_table->installed_cpu; equiv_table++) { + if (sig == equiv_table->installed_cpu) + return equiv_table->equiv_cpu; } - return size; -} - -static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, - unsigned int sig) -{ - int i = 0; - - if (!equiv_cpu_table) - return 0; - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (sig == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; - } return 0; } @@ -118,91 +77,109 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, * This scans the ucode blob for the proper container as we can have multiple * containers glued together. Returns the equivalence ID from the equivalence * table or 0 if none found. + * Returns the amount of bytes consumed while scanning. @desc contains all the + * data we're going to use in later stages of the application. */ -static u16 -find_proper_container(u8 *ucode, size_t size, struct container *ret_cont) +static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc) { - struct container ret = { NULL, 0 }; - u32 eax, ebx, ecx, edx; struct equiv_cpu_entry *eq; - int offset, left; - u16 eq_id = 0; - u32 *header; - u8 *data; + ssize_t orig_size = size; + u32 *hdr = (u32 *)ucode; + u16 eq_id; + u8 *buf; - data = ucode; - left = size; - header = (u32 *)data; + /* Am I looking at an equivalence table header? */ + if (hdr[0] != UCODE_MAGIC || + hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE || + hdr[2] == 0) + return CONTAINER_HDR_SZ; + buf = ucode; - /* find equiv cpu table */ - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return eq_id; + eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ); - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); + /* Find the equivalence ID of our CPU in this table: */ + eq_id = find_equiv_id(eq, desc->cpuid_1_eax); - while (left > 0) { - eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + buf += hdr[2] + CONTAINER_HDR_SZ; + size -= hdr[2] + CONTAINER_HDR_SZ; - ret.data = data; + /* + * Scan through the rest of the container to find where it ends. We do + * some basic sanity-checking too. + */ + while (size > 0) { + struct microcode_amd *mc; + u32 patch_size; - /* Advance past the container header */ - offset = header[2] + CONTAINER_HDR_SZ; - data += offset; - left -= offset; + hdr = (u32 *)buf; + + if (hdr[0] != UCODE_UCODE_TYPE) + break; - eq_id = find_equiv_id(eq, eax); - if (eq_id) { - ret.size = compute_container_size(ret.data, left + offset); + /* Sanity-check patch size. */ + patch_size = hdr[1]; + if (patch_size > PATCH_MAX_SIZE) + break; - /* - * truncate how much we need to iterate over in the - * ucode update loop below - */ - left = ret.size - offset; + /* Skip patch section header: */ + buf += SECTION_HDR_SIZE; + size -= SECTION_HDR_SIZE; - *ret_cont = ret; - return eq_id; + mc = (struct microcode_amd *)buf; + if (eq_id == mc->hdr.processor_rev_id) { + desc->psize = patch_size; + desc->mc = mc; } - /* - * support multiple container files appended together. if this - * one does not have a matching equivalent cpu entry, we fast - * forward to the next container file. - */ - while (left > 0) { - header = (u32 *)data; - - if (header[0] == UCODE_MAGIC && - header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) - break; - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } + buf += patch_size; + size -= patch_size; + } - /* mark where the next microcode container file starts */ - offset = data - (u8 *)ucode; - ucode = data; + /* + * If we have found a patch (desc->mc), it means we're looking at the + * container which has a patch for this CPU so return 0 to mean, @ucode + * already points to the proper container. Otherwise, we return the size + * we scanned so that we can advance to the next container in the + * buffer. + */ + if (desc->mc) { + desc->data = ucode; + desc->size = orig_size - size; + + return 0; } - return eq_id; + return orig_size - size; } -static int __apply_microcode_amd(struct microcode_amd *mc_amd) +/* + * Scan the ucode blob for the proper container as we can have multiple + * containers glued together. + */ +static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) +{ + ssize_t rem = size; + + while (rem >= 0) { + ssize_t s = parse_container(ucode, rem, desc); + if (!s) + return; + + ucode += s; + rem -= s; + } +} + +static int __apply_microcode_amd(struct microcode_amd *mc) { u32 rev, dummy; - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); /* verify patch application was successful */ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) + if (rev != mc->hdr.patch_id) return -1; return 0; @@ -217,17 +194,16 @@ static int __apply_microcode_amd(struct microcode_amd *mc_amd) * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. * - * Returns true if container found (sets @ret_cont), false otherwise. + * Returns true if container found (sets @desc), false otherwise. */ -static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, - struct container *ret_cont) +static bool +apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) { + struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; - u32 rev, *header, *new_rev; - struct container ret; - int offset, left; - u16 eq_id = 0; - u8 *data; + struct microcode_amd *mc; + u32 rev, dummy, *new_rev; + bool ret = false; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); @@ -237,50 +213,27 @@ static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch, patch = &amd_ucode_patch; #endif - if (check_current_patch_level(&rev, true)) - return false; - - eq_id = find_proper_container(ucode, size, &ret); - if (!eq_id) - return false; - - this_equiv_id = eq_id; - header = (u32 *)ret.data; + desc.cpuid_1_eax = cpuid_1_eax; - /* We're pointing to an equiv table, skip over it. */ - data = ret.data + header[2] + CONTAINER_HDR_SZ; - left = ret.size - (header[2] + CONTAINER_HDR_SZ); - - while (left > 0) { - struct microcode_amd *mc; + scan_containers(ucode, size, &desc); - header = (u32 *)data; - if (header[0] != UCODE_UCODE_TYPE || /* type */ - header[1] == 0) /* size */ - break; - - mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); - - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + mc = desc.mc; + if (!mc) + return ret; - if (!__apply_microcode_amd(mc)) { - rev = mc->hdr.patch_id; - *new_rev = rev; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev >= mc->hdr.patch_id) + return ret; - if (save_patch) - memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE)); - } - } + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; + ret = true; - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; + if (save_patch) + memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); } - if (ret_cont) - *ret_cont = ret; - - return true; + return ret; } static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) @@ -298,10 +251,9 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) #endif } -void __init load_ucode_amd_bsp(unsigned int family) +static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) { struct ucode_cpu_info *uci; - u32 eax, ebx, ecx, edx; struct cpio_data cp; const char *path; bool use_pa; @@ -316,183 +268,91 @@ void __init load_ucode_amd_bsp(unsigned int family) use_pa = false; } - if (!get_builtin_microcode(&cp, family)) + if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) cp = find_microcode_in_initrd(path, use_pa); - if (!(cp.data && cp.size)) - return; - - /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - uci->cpu_sig.sig = eax; + /* Needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_1_eax; - apply_microcode_early_amd(cp.data, cp.size, true, NULL); + *ret = cp; } -#ifdef CONFIG_X86_32 -/* - * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory. - * So during cold boot, AP will apply_ucode_in_initrd() just like the BSP. - * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, - * which is used upon resume from suspend. - */ -void load_ucode_amd_ap(unsigned int family) +void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) { - struct microcode_amd *mc; - struct cpio_data cp; - - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { - __apply_microcode_amd(mc); - return; - } - - if (!get_builtin_microcode(&cp, family)) - cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true); + struct cpio_data cp = { }; + __load_ucode_amd(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - /* - * This would set amd_ucode_patch above so that the following APs can - * use it directly instead of going down this path again. - */ - apply_microcode_early_amd(cp.data, cp.size, true, NULL); + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true); } -#else -void load_ucode_amd_ap(unsigned int family) + +void load_ucode_amd_ap(unsigned int cpuid_1_eax) { - struct equiv_cpu_entry *eq; struct microcode_amd *mc; - u32 rev, eax; - u16 eq_id; - - /* 64-bit runs with paging enabled, thus early==false. */ - if (check_current_patch_level(&rev, false)) - return; - - /* First AP hasn't cached it yet, go through the blob. */ - if (!cont.data) { - struct cpio_data cp = { NULL, 0, "" }; + struct cpio_data cp; + u32 *new_rev, rev, dummy; - if (cont.size == -1) - return; + if (IS_ENABLED(CONFIG_X86_32)) { + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + } else { + mc = (struct microcode_amd *)amd_ucode_patch; + new_rev = &ucode_new_rev; + } -reget: - if (!get_builtin_microcode(&cp, family)) { -#ifdef CONFIG_BLK_DEV_INITRD - if (!initrd_gone) - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); -#endif - if (!(cp.data && cp.size)) { - /* - * Mark it so that other APs do not scan again - * for no real reason and slow down boot - * needlessly. - */ - cont.size = -1; - return; - } - } + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (!apply_microcode_early_amd(cp.data, cp.size, false, &cont)) { - cont.size = -1; + /* Check whether we have saved a new patch already: */ + if (*new_rev && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; return; } } - eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ); - - eq_id = find_equiv_id(eq, eax); - if (!eq_id) + __load_ucode_amd(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) return; - if (eq_id == this_equiv_id) { - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - ucode_new_rev = mc->hdr.patch_id; - } - - } else { - - /* - * AP has a different equivalence ID than BSP, looks like - * mixed-steppings silicon so go through the ucode blob anew. - */ - goto reget; - } + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); } -#endif /* CONFIG_X86_32 */ static enum ucode_state -load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); +load_microcode_amd(bool save, u8 family, const u8 *data, size_t size); -int __init save_microcode_in_initrd_amd(unsigned int fam) +int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { + struct cont_desc desc = { 0 }; enum ucode_state ret; - int retval = 0; - u16 eq_id; - - if (!cont.data) { - if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) { - struct cpio_data cp = { NULL, 0, "" }; - -#ifdef CONFIG_BLK_DEV_INITRD - cp = find_cpio_data(ucode_path, (void *)initrd_start, - initrd_end - initrd_start, NULL); -#endif + struct cpio_data cp; - if (!(cp.data && cp.size)) { - cont.size = -1; - return -EINVAL; - } + cp = find_microcode_in_initrd(ucode_path, false); + if (!(cp.data && cp.size)) + return -EINVAL; - eq_id = find_proper_container(cp.data, cp.size, &cont); - if (!eq_id) { - cont.size = -1; - return -EINVAL; - } + desc.cpuid_1_eax = cpuid_1_eax; - } else - return -EINVAL; - } + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; - ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size); + ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); if (ret != UCODE_OK) - retval = -EINVAL; - - /* - * This will be freed any msec now, stash patches for the current - * family and switch to patch cache for cpu hotplug, etc later. - */ - cont.data = NULL; - cont.size = 0; + return -EINVAL; - return retval; + return 0; } void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev; - - /* - * early==false because this is a syscore ->resume path and by - * that time paging is long enabled. - */ - if (check_current_patch_level(&rev, false)) - return; + u32 rev, dummy; mc = (struct microcode_amd *)amd_ucode_patch; - if (!mc) - return; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { @@ -631,60 +491,13 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } -/* - * Those patch levels cannot be updated to newer ones and thus should be final. - */ -static u32 final_levels[] = { - 0x01000098, - 0x0100009f, - 0x010000af, - 0, /* T-101 terminator */ -}; - -/* - * Check the current patch level on this CPU. - * - * @rev: Use it to return the patch level. It is set to 0 in the case of - * error. - * - * Returns: - * - true: if update should stop - * - false: otherwise - */ -bool check_current_patch_level(u32 *rev, bool early) -{ - u32 lvl, dummy, i; - bool ret = false; - u32 *levels; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); - - if (IS_ENABLED(CONFIG_X86_32) && early) - levels = (u32 *)__pa_nodebug(&final_levels); - else - levels = final_levels; - - for (i = 0; levels[i]; i++) { - if (lvl == levels[i]) { - lvl = 0; - ret = true; - break; - } - } - - if (rev) - *rev = lvl; - - return ret; -} - static int apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; - u32 rev; + u32 rev, dummy; BUG_ON(raw_smp_processor_id() != cpu); @@ -697,8 +510,7 @@ static int apply_microcode_amd(int cpu) mc_amd = p->data; uci->mc = p->data; - if (check_current_patch_level(&rev, false)) - return -1; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { @@ -862,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, } static enum ucode_state -load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) +load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) { enum ucode_state ret; @@ -876,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) #ifdef CONFIG_X86_32 /* save BSP's matching patch for early load */ - if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(cpu); + if (save) { + struct ucode_patch *p = find_patch(0); if (p) { memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), @@ -909,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; /* reload ucode container only on the boot cpu */ - if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + if (!refresh_fw || !bsp) return UCODE_OK; if (c->x86 >= 0x15) @@ -930,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size); + ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 73102d932760..9cb98ee103db 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -1,7 +1,7 @@ /* * CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <[email protected]> + * Copyright (C) 2000-2006 Tigran Aivazian <[email protected]> * 2006 Shaohua Li <[email protected]> * 2013-2016 Borislav Petkov <[email protected]> * @@ -66,19 +66,50 @@ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -/* - * Operations that are run on a target cpu: - */ - struct cpu_info_ctx { struct cpu_signature *cpu_sig; int err; }; +/* + * Those patch levels cannot be updated to newer ones and thus should be final. + */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + +/* + * Check the current patch level on this CPU. + * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +static bool amd_check_current_patch_level(void) +{ + u32 lvl, dummy, i; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32)) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) + return true; + } + return false; +} + static bool __init check_loader_disabled_bsp(void) { static const char *__dis_opt_str = "dis_ucode_ldr"; - u32 a, b, c, d; #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); @@ -94,18 +125,19 @@ static bool __init check_loader_disabled_bsp(void) if (!have_cpuid_p()) return *res; - a = 1; - c = 0; - native_cpuid(&a, &b, &c, &d); - /* * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not * completely accurate as xen pv guests don't see that CPUID bit set but * that's good enough as they don't land on the BSP path anyway. */ - if (c & BIT(31)) + if (native_cpuid_ecx(1) & BIT(31)) return *res; + if (x86_cpuid_vendor() == X86_VENDOR_AMD) { + if (amd_check_current_patch_level()) + return *res; + } + if (cmdline_find_option_bool(cmdline, option) <= 0) *res = false; @@ -133,23 +165,21 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { - int vendor; - unsigned int family; + unsigned int cpuid_1_eax; if (check_loader_disabled_bsp()) return; - vendor = x86_cpuid_vendor(); - family = x86_cpuid_family(); + cpuid_1_eax = native_cpuid_eax(1); - switch (vendor) { + switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (family >= 6) + if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_bsp(); break; case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(family); + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_bsp(cpuid_1_eax); break; default: break; @@ -167,22 +197,21 @@ static bool check_loader_disabled_ap(void) void load_ucode_ap(void) { - int vendor, family; + unsigned int cpuid_1_eax; if (check_loader_disabled_ap()) return; - vendor = x86_cpuid_vendor(); - family = x86_cpuid_family(); + cpuid_1_eax = native_cpuid_eax(1); - switch (vendor) { + switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (family >= 6) + if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_ap(); break; case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_ap(family); + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_ap(cpuid_1_eax); break; default: break; @@ -201,7 +230,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - ret = save_microcode_in_initrd_amd(c->x86); + return save_microcode_in_initrd_amd(cpuid_eax(1)); break; default: break; @@ -261,6 +290,17 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) return (struct cpio_data){ NULL, 0, "" }; if (initrd_start) start = initrd_start; + } else { + /* + * The picture with physical addresses is a bit different: we + * need to get the *physical* address to which the ramdisk was + * relocated, i.e., relocated_ramdisk (not initrd_start) and + * since we're running from physical addresses, we need to access + * relocated_ramdisk through its *physical* address too. + */ + u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk); + if (*rr) + start = *rr; } return find_cpio_data(path, (void *)start, size, NULL); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8325d8a09ab0..59edbe9d4ccb 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1,7 +1,7 @@ /* * Intel CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <[email protected]> + * Copyright (C) 2000-2006 Tigran Aivazian <[email protected]> * 2006 Shaohua Li <[email protected]> * * Intel CPU microcode early update for Linux @@ -42,7 +42,7 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; /* Current microcode patch used in early patching on the APs. */ -struct microcode_intel *intel_ucode_patch; +static struct microcode_intel *intel_ucode_patch; static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, unsigned int s2, unsigned int p2) @@ -166,7 +166,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) static void save_microcode_patch(void *data, unsigned int size) { struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - struct ucode_patch *iter, *tmp, *p; + struct ucode_patch *iter, *tmp, *p = NULL; bool prev_found = false; unsigned int sig, pf; @@ -202,6 +202,18 @@ static void save_microcode_patch(void *data, unsigned int size) else list_add_tail(&p->plist, µcode_cache); } + + /* + * Save for early loading. On 32-bit, that needs to be a physical + * address as the APs are running from physical addresses, before + * paging has been enabled. + */ + if (p) { + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; + } } static int microcode_sanity_check(void *mc, int print_err) @@ -607,6 +619,14 @@ int __init save_microcode_in_initrd_intel(void) struct ucode_cpu_info uci; struct cpio_data cp; + /* + * initrd is going away, clear patch ptr. We will scan the microcode one + * last time before jettisoning and save a patch, if found. Then we will + * update that pointer too, with a stable patch address to use when + * resuming the cores. + */ + intel_ucode_patch = NULL; + if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path, false); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 65e20c97e04b..70e717fccdd6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -49,6 +49,9 @@ void hyperv_vector_handler(struct pt_regs *regs) if (vmbus_handler) vmbus_handler(); + if (ms_hyperv.hints & HV_X64_DEPRECATING_AEOI_RECOMMENDED) + ack_APIC_irq(); + exiting_irq(); set_irq_regs(old_regs); } @@ -133,26 +136,6 @@ static uint32_t __init ms_hyperv_platform(void) return 0; } -static u64 read_hv_clock(struct clocksource *arg) -{ - u64 current_tick; - /* - * Read the partition counter to get the current tick count. This count - * is set to 0 when the partition is created and is incremented in - * 100 nanosecond units. - */ - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - return current_tick; -} - -static struct clocksource hyperv_cs = { - .name = "hyperv_clocksource", - .rating = 400, /* use this when running on Hyperv*/ - .read = read_hv_clock, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - static unsigned char hv_get_nmi_reason(void) { return 0; @@ -178,8 +161,22 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) } #endif +static unsigned long hv_get_tsc_khz(void) +{ + unsigned long freq; + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + + return freq / 1000; +} + static void __init ms_hyperv_init_platform(void) { + int hv_host_info_eax; + int hv_host_info_ebx; + int hv_host_info_ecx; + int hv_host_info_edx; + /* * Extract the features and hints */ @@ -190,8 +187,30 @@ static void __init ms_hyperv_init_platform(void) pr_info("HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + /* + * Extract host information. + */ + if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) { + hv_host_info_eax = cpuid_eax(HVCPUID_VERSION); + hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION); + hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION); + hv_host_info_edx = cpuid_edx(HVCPUID_VERSION); + + pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", + hv_host_info_eax, hv_host_info_ebx >> 16, + hv_host_info_ebx & 0xFFFF, hv_host_info_ecx, + hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); + } + + if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { + x86_platform.calibrate_tsc = hv_get_tsc_khz; + x86_platform.calibrate_cpu = hv_get_tsc_khz; + } + #ifdef CONFIG_X86_LOCAL_APIC - if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { + if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { /* * Get the APIC frequency. */ @@ -208,9 +227,6 @@ static void __init ms_hyperv_init_platform(void) "hv_nmi_unknown"); #endif - if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) - clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); - #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif @@ -227,6 +243,13 @@ static void __init ms_hyperv_init_platform(void) */ if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason; + +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Setup the hook to get control post apic initialization. + */ + x86_platform.apic_post_init = hyperv_init; +#endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 3b442b64c72d..765afd599039 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -27,7 +27,7 @@ #include <linux/range.h> #include <asm/processor.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -860,7 +860,7 @@ real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); + return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED); } /** @@ -978,7 +978,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) WARN_ON(1); pr_info("update e820 for mtrr\n"); - update_e820(); + e820__update_table_print(); return 1; } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 24e87e74990d..c5bb63be4ba1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -48,7 +48,7 @@ #include <linux/syscore_ops.h> #include <asm/cpufeature.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/pat.h> @@ -807,10 +807,8 @@ void mtrr_save_state(void) if (!mtrr_enabled()) return; - get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); - put_online_cpus(); } void set_mtrr_aps_delayed_init(void) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 18ca99f2798b..218f79825b3c 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -2,7 +2,6 @@ #include <linux/timex.h> #include <linux/string.h> #include <linux/seq_file.h> -#include <linux/cpufreq.h> /* * Get CPU information for use by the procfs. @@ -31,14 +30,13 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" - "wp\t\t: %s\n", + "wp\t\t: yes\n", static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - c->cpuid_level, - c->wp_works_ok ? "yes" : "no"); + c->cpuid_level); } #else static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) @@ -77,14 +75,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); - if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get(cpu); - - if (!freq) - freq = cpu_khz; + if (cpu_has(c, X86_FEATURE_TSC)) seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); - } + cpu_khz / 1000, (cpu_khz % 1000)); /* Cache size */ if (c->x86_cache_size >= 0) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d9794060fe22..23c23508c012 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 34178564be2a..d77d07ab310b 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,4 +1,6 @@ #include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/mm.h> #include <asm/cpufeature.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 891f4dad7b2c..40ed26852ebd 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -30,7 +30,6 @@ #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> -#include <asm/timer.h> #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt @@ -114,6 +113,24 @@ static void __init vmware_paravirt_ops_setup(void) #define vmware_paravirt_ops_setup() do {} while (0) #endif +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +static void __init vmware_set_capabilities(void) +{ + setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); +} + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -153,6 +170,8 @@ static void __init vmware_platform_setup(void) #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif + + vmware_set_capabilities(); } /* @@ -177,24 +196,6 @@ static uint32_t __init vmware_platform(void) return 0; } -/* - * VMware hypervisor takes care of exporting a reliable TSC to the guest. - * Still, due to timing difference when running on virtual cpus, the TSC can - * be marked as unstable in some cases. For example, the TSC sync check at - * bootup can fail due to a marginal offset between vcpus' TSCs (though the - * TSCs do not drift from each other). Also, the ACPI PM timer clocksource - * is not suitable as a watchdog when running on a hypervisor because the - * kernel may miss a wrap of the counter if the vcpu is descheduled for a - * long time. To skip these checks at runtime we set these capability bits, - * so that the kernel could just trust the hypervisor with providing a - * reliable virtual TSC that is suitable for timekeeping. - */ -static void vmware_set_cpu_features(struct cpuinfo_x86 *c) -{ - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); -} - /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ static bool __init vmware_legacy_x2apic_available(void) { @@ -207,7 +208,6 @@ static bool __init vmware_legacy_x2apic_available(void) const __refconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, - .set_cpu_features = vmware_set_cpu_features, .init_platform = vmware_platform_setup, .x2apic_available = vmware_legacy_x2apic_available, }; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 3741461c63a0..44404e2307bb 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,6 +29,7 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/e820/types.h> #include <asm/io_apic.h> #include <asm/hpet.h> #include <linux/kdebug.h> @@ -456,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, bufp += sizeof(Elf64_Phdr); phdr->p_type = PT_NOTE; phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; (ehdr->e_phnum)++; #ifdef CONFIG_X86_64 @@ -503,16 +504,16 @@ static int prepare_elf_headers(struct kimage *image, void **addr, return ret; } -static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) { unsigned int nr_e820_entries; nr_e820_entries = params->e820_entries; - if (nr_e820_entries >= E820MAX) + if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) return 1; - memcpy(¶ms->e820_map[nr_e820_entries], entry, - sizeof(struct e820entry)); + memcpy(¶ms->e820_table[nr_e820_entries], entry, + sizeof(struct e820_entry)); params->e820_entries++; return 0; } @@ -521,7 +522,7 @@ static int memmap_entry_callback(u64 start, u64 end, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; - struct e820entry ei; + struct e820_entry ei; ei.addr = start; ei.size = end - start + 1; @@ -560,7 +561,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) { int i, ret = 0; unsigned long flags; - struct e820entry ei; + struct e820_entry ei; struct crash_memmap_data cmd; struct crash_mem *cmem; @@ -574,17 +575,17 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add first 640K segment */ ei.addr = image->arch.backup_src_start; ei.size = image->arch.backup_src_sz; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); /* Add ACPI tables */ - cmd.type = E820_ACPI; + cmd.type = E820_TYPE_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ - cmd.type = E820_NVS; + cmd.type = E820_TYPE_NVS; walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); @@ -592,7 +593,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) if (crashk_low_res.end) { ei.addr = crashk_low_res.start; ei.size = crashk_low_res.end - crashk_low_res.start + 1; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } @@ -609,7 +610,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) if (ei.size < PAGE_SIZE) continue; ei.addr = cmem->ranges[i].start; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3fe45f84ced4..cbf1f6ba39a8 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -235,8 +235,7 @@ static void __init dtb_add_ioapic(struct device_node *dn) ret = of_address_to_resource(dn, 0, &r); if (ret) { - printk(KERN_ERR "Can't obtain address from node %s.\n", - dn->full_name); + printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn); return; } mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index b2f7207ba86c..f9c324e08d85 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -1,5 +1,6 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/init_task.h> #include <linux/fs.h> diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0cfd01d2754c..dbce3cca94cb 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -10,6 +10,8 @@ #include <linux/kdebug.h> #include <linux/module.h> #include <linux/ptrace.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> #include <linux/ftrace.h> #include <linux/kexec.h> #include <linux/bug.h> @@ -75,7 +77,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (regs = NULL; stack; stack = stack_info.next_sp) { + for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; /* @@ -287,9 +289,6 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode(regs)) - report_bug(regs->ip, regs); - if (__die(str, regs, err)) sig = 0; oops_end(flags, regs, sig); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bb3b5b9a6899..e5f0b40e66d2 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -2,6 +2,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ +#include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -161,15 +162,3 @@ void show_regs(struct pt_regs *regs) } pr_cont("\n"); } - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index fac189efcc34..3e1471d57487 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -2,6 +2,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ +#include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -177,13 +178,3 @@ void show_regs(struct pt_regs *regs) } pr_cont("\n"); } - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 90e8dde3ec26..532da61d605c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1,49 +1,67 @@ /* - * Handle the memory map. - * The functions here do the job until bootmem takes over. + * Low level x86 E820 memory map handling functions. * - * Getting sanitize_e820_map() in sync with i386 version by applying change: - * - Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <[email protected]>, December 2002. - * Venkatesh Pallipadi <[email protected]> + * The firmware and bootloader passes us the "E820 table", which is the primary + * physical memory layout description available about x86 systems. * + * The kernel takes the E820 memory layout and optionally modifies it with + * quirks and other tweaks, and feeds that into the generic Linux memory + * allocation code routines via a platform independent interface (memblock, etc.). */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> #include <linux/crash_dump.h> -#include <linux/export.h> #include <linux/bootmem.h> -#include <linux/pfn.h> #include <linux/suspend.h> #include <linux/acpi.h> #include <linux/firmware-map.h> #include <linux/memblock.h> #include <linux/sort.h> -#include <asm/e820.h> -#include <asm/proto.h> +#include <asm/e820/api.h> #include <asm/setup.h> -#include <asm/cpufeature.h> /* - * The e820 map is the map that gets modified e.g. with command line parameters - * and that is also registered with modifications in the kernel resource tree - * with the iomem_resource as parent. + * We organize the E820 table into three main data structures: * - * The e820_saved is directly saved after the BIOS-provided memory map is - * copied. It doesn't get modified afterwards. It's registered for the - * /sys/firmware/memmap interface. + * - 'e820_table_firmware': the original firmware version passed to us by the + * bootloader - not modified by the kernel. It is composed of two parts: + * the first 128 E820 memory entries in boot_params.e820_table and the remaining + * (if any) entries of the SETUP_E820_EXT nodes. We use this to: * - * That memory map is not modified and is used as base for kexec. The kexec'd - * kernel should get the same memory map as the firmware provides. Then the - * user can e.g. boot the original kernel with mem=1G while still booting the - * next kernel with full memory. + * - inform the user about the firmware's notion of memory layout + * via /sys/firmware/memmap + * + * - the hibernation code uses it to generate a kernel-independent MD5 + * fingerprint of the physical memory layout of a system. + * + * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version + * passed to us by the bootloader - the major difference between + * e820_table_firmware[] and this one is that, the latter marks the setup_data + * list created by the EFI boot stub as reserved, so that kexec can reuse the + * setup_data information in the second kernel. Besides, e820_table_kexec[] + * might also be modified by the kexec itself to fake a mptable. + * We use this to: + * + * - kexec, which is a bootloader in disguise, uses the original E820 + * layout to pass to the kexec-ed kernel. This way the original kernel + * can have a restricted E820 map while the kexec()-ed kexec-kernel + * can have access to full memory - etc. + * + * - 'e820_table': this is the main E820 table that is massaged by the + * low level x86 platform code, or modified by boot parameters, before + * passed on to higher level MM layers. + * + * Once the E820 map has been converted to the standard Linux memory layout + * information its role stops - modifying it has no effect and does not get + * re-propagated. So itsmain role is a temporary bootstrap storage of firmware + * specific memory layout data during early bootup. */ -static struct e820map initial_e820 __initdata; -static struct e820map initial_e820_saved __initdata; -struct e820map *e820 __refdata = &initial_e820; -struct e820map *e820_saved __refdata = &initial_e820_saved; +static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_kexec_init __initdata; +static struct e820_table e820_table_firmware_init __initdata; + +struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init; +struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -55,51 +73,53 @@ EXPORT_SYMBOL(pci_mem_start); * This function checks if any part of the range <start,end> is mapped * with type. */ -int -e820_any_mapped(u64 start, u64 end, unsigned type) +bool e820__mapped_any(u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (type && ei->type != type) + if (type && entry->type != type) continue; - if (ei->addr >= end || ei->addr + ei->size <= start) + if (entry->addr >= end || entry->addr + entry->size <= start) continue; return 1; } return 0; } -EXPORT_SYMBOL_GPL(e820_any_mapped); +EXPORT_SYMBOL_GPL(e820__mapped_any); /* - * This function checks if the entire range <start,end> is mapped with type. + * This function checks if the entire <start,end> range is mapped with 'type'. * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case + * Note: this function only works correctly once the E820 table is sorted and + * not-overlapping (at least for the range specified), which is the case normally. */ -int __init e820_all_mapped(u64 start, u64 end, unsigned type) +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (type && ei->type != type) + if (type && entry->type != type) continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) + + /* Is the region (part) in overlap with the current region? */ + if (entry->addr >= end || entry->addr + entry->size <= start) continue; - /* if the region is at the beginning of <start,end> we move - * start to the end of the region since it's ok until there + /* + * If the region is at the beginning of <start,end> we move + * 'start' to the end of the region since it's ok until there */ - if (ei->addr <= start) - start = ei->addr + ei->size; + if (entry->addr <= start) + start = entry->addr + entry->size; + /* - * if start is now at or beyond end, we're done, full - * coverage + * If 'start' is now at or beyond 'end', we're done, full + * coverage of the desired range exists: */ if (start >= end) return 1; @@ -108,94 +128,77 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) } /* - * Add a memory region to the kernel e820 map. + * Add a memory region to the kernel E820 map. */ -static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, - int type) +static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { - int x = e820x->nr_map; + int x = table->nr_entries; - if (x >= ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", - (unsigned long long) start, - (unsigned long long) (start + size - 1)); + if (x >= ARRAY_SIZE(table->entries)) { + pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1); return; } - e820x->map[x].addr = start; - e820x->map[x].size = size; - e820x->map[x].type = type; - e820x->nr_map++; + table->entries[x].addr = start; + table->entries[x].size = size; + table->entries[x].type = type; + table->nr_entries++; } -void __init e820_add_region(u64 start, u64 size, int type) +void __init e820__range_add(u64 start, u64 size, enum e820_type type) { - __e820_add_region(e820, start, size, type); + __e820__range_add(e820_table, start, size, type); } -static void __init e820_print_type(u32 type) +static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_RAM: - case E820_RESERVED_KERN: - printk(KERN_CONT "usable"); - break; - case E820_RESERVED: - printk(KERN_CONT "reserved"); - break; - case E820_ACPI: - printk(KERN_CONT "ACPI data"); - break; - case E820_NVS: - printk(KERN_CONT "ACPI NVS"); - break; - case E820_UNUSABLE: - printk(KERN_CONT "unusable"); - break; - case E820_PMEM: - case E820_PRAM: - printk(KERN_CONT "persistent (type %u)", type); - break; - default: - printk(KERN_CONT "type %u", type); - break; + case E820_TYPE_RAM: /* Fall through: */ + case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_ACPI: pr_cont("ACPI data"); break; + case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; + case E820_TYPE_PMEM: /* Fall through: */ + case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break; + default: pr_cont("type %u", type); break; } } -void __init e820_print_map(char *who) +void __init e820__print_table(char *who) { int i; - for (i = 0; i < e820->nr_map; i++) { - printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820->map[i].addr, - (unsigned long long) - (e820->map[i].addr + e820->map[i].size - 1)); - e820_print_type(e820->map[i].type); - printk(KERN_CONT "\n"); + for (i = 0; i < e820_table->nr_entries; i++) { + pr_info("%s: [mem %#018Lx-%#018Lx] ", who, + e820_table->entries[i].addr, + e820_table->entries[i].addr + e820_table->entries[i].size - 1); + + e820_print_type(e820_table->entries[i].type); + pr_cont("\n"); } } /* - * Sanitize the BIOS e820 map. + * Sanitize an E820 map. * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps, + * Some E820 layouts include overlapping entries. The following + * replaces the original E820 map with a new one, removing overlaps, * and resolving conflicting memory types in favor of highest * numbered type. * - * The input parameter biosmap points to an array of 'struct - * e820entry' which on entry has elements in the range [0, *pnr_map) - * valid, and which has space for up to max_nr_map entries. - * On return, the resulting sanitized e820 map entries will be in - * overwritten in the same location, starting at biosmap. + * The input parameter 'entries' points to an array of 'struct + * e820_entry' which on entry has elements in the range [0, *nr_entries) + * valid, and which has space for up to max_nr_entries entries. + * On return, the resulting sanitized E820 map entries will be in + * overwritten in the same location, starting at 'entries'. * - * The integer pointed to by pnr_map must be valid on entry (the - * current number of valid entries located at biosmap). If the - * sanitizing succeeds the *pnr_map will be updated with the new - * number of valid entries (something no more than max_nr_map). + * The integer pointed to by nr_entries must be valid on entry (the + * current number of valid entries located at 'entries'). If the + * sanitizing succeeds the *nr_entries will be updated with the new + * number of valid entries (something no more than max_nr_entries). * - * The return value from sanitize_e820_map() is zero if it + * The return value from e820__update_table() is zero if it * successfully 'sanitized' the map entries passed in, and is -1 * if it did nothing, which can happen if either of (1) it was * only passed one map entry, or (2) any of the input map entries @@ -238,10 +241,17 @@ void __init e820_print_map(char *who) * ______________________4_ */ struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ + /* Pointer to the original entry: */ + struct e820_entry *entry; + /* Address for this change point: */ + unsigned long long addr; }; +static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; +static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata; +static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata; +static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata; + static int __init cpcompare(const void *a, const void *b) { struct change_member * const *app = a, * const *bpp = b; @@ -249,164 +259,140 @@ static int __init cpcompare(const void *a, const void *b) /* * Inputs are pointers to two elements of change_point[]. If their - * addresses are unequal, their difference dominates. If the addresses + * addresses are not equal, their difference dominates. If the addresses * are equal, then consider one that represents the end of its region * to be greater than one that does not. */ if (ap->addr != bp->addr) return ap->addr > bp->addr ? 1 : -1; - return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); + return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } -int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, - u32 *pnr_map) +int __init e820__update_table(struct e820_table *table) { - static struct change_member change_point_list[2*E820_X_MAX] __initdata; - static struct change_member *change_point[2*E820_X_MAX] __initdata; - static struct e820entry *overlap_list[E820_X_MAX] __initdata; - static struct e820entry new_bios[E820_X_MAX] __initdata; - unsigned long current_type, last_type; + struct e820_entry *entries = table->entries; + u32 max_nr_entries = ARRAY_SIZE(table->entries); + enum e820_type current_type, last_type; unsigned long long last_addr; - int chgidx; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; + u32 new_nr_entries, overlap_entries; + u32 i, chg_idx, chg_nr; - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) + /* If there's only one memory region, don't bother: */ + if (table->nr_entries < 2) return -1; - old_nr = *pnr_map; - BUG_ON(old_nr > max_nr_map); + BUG_ON(table->nr_entries > max_nr_entries); - /* bail out if we find any unreasonable addresses in bios map */ - for (i = 0; i < old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + /* Bail out if we find any unreasonable addresses in the map: */ + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].addr + entries[i].size < entries[i].addr) return -1; + } - /* create pointers for initial change-point information (for sorting) */ - for (i = 0; i < 2 * old_nr; i++) + /* Create pointers for initial change-point information (for sorting): */ + for (i = 0; i < 2 * table->nr_entries; i++) change_point[i] = &change_point_list[i]; - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i = 0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + - biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; + /* + * Record all known change-points (starting and ending addresses), + * omitting empty memory regions: + */ + chg_idx = 0; + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].size != 0) { + change_point[chg_idx]->addr = entries[i].addr; + change_point[chg_idx++]->entry = &entries[i]; + change_point[chg_idx]->addr = entries[i].addr + entries[i].size; + change_point[chg_idx++]->entry = &entries[i]; } } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); - - /* create a new bios memory map, removing overlaps */ - overlap_entries = 0; /* number of entries in the overlap table */ - new_bios_entry = 0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - - /* loop through change-points, determining affect on the new bios map */ - for (chgidx = 0; chgidx < chg_nr; chgidx++) { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == - change_point[chgidx]->pbios->addr) { - /* - * add map entry to overlap list (> 1 entry - * implies an overlap) - */ - overlap_list[overlap_entries++] = - change_point[chgidx]->pbios; + chg_nr = chg_idx; + + /* Sort change-point list by memory addresses (low -> high): */ + sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL); + + /* Create a new memory map, removing overlaps: */ + overlap_entries = 0; /* Number of entries in the overlap table */ + new_nr_entries = 0; /* Index for creating new map entries */ + last_type = 0; /* Start with undefined memory type */ + last_addr = 0; /* Start with 0 as last starting address */ + + /* Loop through change-points, determining effect on the new map: */ + for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) { + /* Keep track of all overlapping entries */ + if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) { + /* Add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++] = change_point[chg_idx]->entry; } else { - /* - * remove entry from list (order independent, - * so swap with last) - */ + /* Remove entry from list (order independent, so swap with last): */ for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == - change_point[chgidx]->pbios) - overlap_list[i] = - overlap_list[overlap_entries-1]; + if (overlap_list[i] == change_point[chg_idx]->entry) + overlap_list[i] = overlap_list[overlap_entries-1]; } overlap_entries--; } /* - * if there are overlapping entries, decide which + * If there are overlapping entries, decide which * "type" to use (larger value takes precedence -- * 1=usable, 2,3,4,4+=unusable) */ current_type = 0; - for (i = 0; i < overlap_entries; i++) + for (i = 0; i < overlap_entries; i++) { if (overlap_list[i]->type > current_type) current_type = overlap_list[i]->type; - /* - * continue building up new bios map based on this - * information - */ - if (current_type != last_type || current_type == E820_PRAM) { + } + + /* Continue building up new map based on this information: */ + if (current_type != last_type || current_type == E820_TYPE_PRAM) { if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* - * move forward only if the new size - * was non-zero - */ - if (new_bios[new_bios_entry].size != 0) - /* - * no more space left for new - * bios entries ? - */ - if (++new_bios_entry >= max_nr_map) + new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; + /* Move forward only if the new size was non-zero: */ + if (new_entries[new_nr_entries].size != 0) + /* No more space left for new entries? */ + if (++new_nr_entries >= max_nr_entries) break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = - change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr = change_point[chgidx]->addr; + new_entries[new_nr_entries].addr = change_point[chg_idx]->addr; + new_entries[new_nr_entries].type = current_type; + last_addr = change_point[chg_idx]->addr; } last_type = current_type; } } - /* retain count for new bios entries */ - new_nr = new_bios_entry; - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); - *pnr_map = new_nr; + /* Copy the new entries into the original location: */ + memcpy(entries, new_entries, new_nr_entries*sizeof(*entries)); + table->nr_entries = new_nr_entries; return 0; } -static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) +static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { - while (nr_map) { - u64 start = biosmap->addr; - u64 size = biosmap->size; + struct boot_e820_entry *entry = entries; + + while (nr_entries) { + u64 start = entry->addr; + u64 size = entry->size; u64 end = start + size - 1; - u32 type = biosmap->type; + u32 type = entry->type; - /* Overflow in 64 bits? Ignore the memory map. */ + /* Ignore the entry on 64-bit overflow: */ if (start > end && likely(size)) return -1; - e820_add_region(start, size, type); + e820__range_add(start, size, type); - biosmap++; - nr_map--; + entry++; + nr_entries--; } return 0; } /* - * Copy the BIOS e820 map into a safe place. + * Copy the BIOS E820 map into a safe place. * * Sanity-check it while we're at it.. * @@ -414,18 +400,17 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init append_e820_map(struct e820entry *biosmap, int nr_map) +static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) + if (nr_entries < 2) return -1; - return __append_e820_map(biosmap, nr_map); + return __append_e820_table(entries, nr_entries); } -static u64 __init __e820_update_range(struct e820map *e820x, u64 start, - u64 size, unsigned old_type, - unsigned new_type) +static u64 __init +__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { u64 end; unsigned int i; @@ -437,77 +422,73 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", - (unsigned long long) start, (unsigned long long) (end - 1)); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1); e820_print_type(old_type); - printk(KERN_CONT " ==> "); + pr_cont(" ==> "); e820_print_type(new_type); - printk(KERN_CONT "\n"); + pr_cont("\n"); - for (i = 0; i < e820x->nr_map; i++) { - struct e820entry *ei = &e820x->map[i]; + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; u64 final_start, final_end; - u64 ei_end; + u64 entry_end; - if (ei->type != old_type) + if (entry->type != old_type) continue; - ei_end = ei->addr + ei->size; - /* totally covered by new range? */ - if (ei->addr >= start && ei_end <= end) { - ei->type = new_type; - real_updated_size += ei->size; + entry_end = entry->addr + entry->size; + + /* Completely covered by new range? */ + if (entry->addr >= start && entry_end <= end) { + entry->type = new_type; + real_updated_size += entry->size; continue; } - /* new range is totally covered? */ - if (ei->addr < start && ei_end > end) { - __e820_add_region(e820x, start, size, new_type); - __e820_add_region(e820x, end, ei_end - end, ei->type); - ei->size = start - ei->addr; + /* New range is completely covered? */ + if (entry->addr < start && entry_end > end) { + __e820__range_add(table, start, size, new_type); + __e820__range_add(table, end, entry_end - end, entry->type); + entry->size = start - entry->addr; real_updated_size += size; continue; } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(end, ei_end); + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); if (final_start >= final_end) continue; - __e820_add_region(e820x, final_start, final_end - final_start, - new_type); + __e820__range_add(table, final_start, final_end - final_start, new_type); real_updated_size += final_end - final_start; /* - * left range could be head or tail, so need to update - * size at first. + * Left range could be head or tail, so need to update + * its size first: */ - ei->size -= final_end - final_start; - if (ei->addr < final_start) + entry->size -= final_end - final_start; + if (entry->addr < final_start) continue; - ei->addr = final_end; + + entry->addr = final_end; } return real_updated_size; } -u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) +u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820_update_range(e820, start, size, old_type, new_type); + return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820_update_range_saved(u64 start, u64 size, - unsigned old_type, unsigned new_type) +static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820_update_range(e820_saved, start, size, old_type, - new_type); + return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); } -/* make e820 not cover the range */ -u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, - int checktype) +/* Remove a range of memory from the E820 table: */ +u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) { int i; u64 end; @@ -517,90 +498,89 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", - (unsigned long long) start, (unsigned long long) (end - 1)); - if (checktype) + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1); + if (check_type) e820_print_type(old_type); - printk(KERN_CONT "\n"); + pr_cont("\n"); - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; u64 final_start, final_end; - u64 ei_end; + u64 entry_end; - if (checktype && ei->type != old_type) + if (check_type && entry->type != old_type) continue; - ei_end = ei->addr + ei->size; - /* totally covered? */ - if (ei->addr >= start && ei_end <= end) { - real_removed_size += ei->size; - memset(ei, 0, sizeof(struct e820entry)); + entry_end = entry->addr + entry->size; + + /* Completely covered? */ + if (entry->addr >= start && entry_end <= end) { + real_removed_size += entry->size; + memset(entry, 0, sizeof(*entry)); continue; } - /* new range is totally covered? */ - if (ei->addr < start && ei_end > end) { - e820_add_region(end, ei_end - end, ei->type); - ei->size = start - ei->addr; + /* Is the new range completely covered? */ + if (entry->addr < start && entry_end > end) { + e820__range_add(end, entry_end - end, entry->type); + entry->size = start - entry->addr; real_removed_size += size; continue; } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(end, ei_end); + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); if (final_start >= final_end) continue; + real_removed_size += final_end - final_start; /* - * left range could be head or tail, so need to update - * size at first. + * Left range could be head or tail, so need to update + * the size first: */ - ei->size -= final_end - final_start; - if (ei->addr < final_start) + entry->size -= final_end - final_start; + if (entry->addr < final_start) continue; - ei->addr = final_end; + + entry->addr = final_end; } return real_removed_size; } -void __init update_e820(void) +void __init e820__update_table_print(void) { - if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map)) + if (e820__update_table(e820_table)) return; - printk(KERN_INFO "e820: modified physical RAM map:\n"); - e820_print_map("modified"); + + pr_info("e820: modified physical RAM map:\n"); + e820__print_table("modified"); } -static void __init update_e820_saved(void) + +static void __init e820__update_table_kexec(void) { - sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map), - &e820_saved->nr_map); + e820__update_table(e820_table_kexec); } + #define MAX_GAP_END 0x100000000ull + /* - * Search for a gap in the e820 memory space from start_addr to end_addr. + * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). */ -__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, - unsigned long start_addr, unsigned long long end_addr) +static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) { - unsigned long long last; - int i = e820->nr_map; + unsigned long long last = MAX_GAP_END; + int i = e820_table->nr_entries; int found = 0; - last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; - while (--i >= 0) { - unsigned long long start = e820->map[i].addr; - unsigned long long end = start + e820->map[i].size; - - if (end < start_addr) - continue; + unsigned long long start = e820_table->entries[i].addr; + unsigned long long end = start + e820_table->entries[i].size; /* * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true + * fit in 32 bits if this condition is true: */ if (last > end) { unsigned long gap = last - end; @@ -618,153 +598,170 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, } /* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. + * Search for the biggest gap in the low 32 bits of the E820 + * memory space. We pass this space to the PCI subsystem, so + * that it can assign MMIO resources for hotplug or + * unconfigured devices in. + * * Hopefully the BIOS let enough space left. */ -__init void e820_setup_gap(void) +__init void e820__setup_pci_gap(void) { unsigned long gapstart, gapsize; int found; - gapstart = 0x10000000; gapsize = 0x400000; - found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END); + found = e820_search_gap(&gapstart, &gapsize); -#ifdef CONFIG_X86_64 if (!found) { +#ifdef CONFIG_X86_64 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR - "e820: cannot find a gap in the 32bit address range\n" - "e820: PCI devices with unassigned 32bit BARs may break!\n"); - } + pr_err( + "e820: Cannot find an available gap in the 32-bit address range\n" + "e820: PCI devices with unassigned 32-bit BARs may not work!\n"); +#else + gapstart = 0x10000000; #endif + } /* - * e820_reserve_resources_late protect stolen RAM already + * e820__reserve_resources_late() protects stolen RAM already: */ pci_mem_start = gapstart; - printk(KERN_INFO - "e820: [mem %#010lx-%#010lx] available for PCI devices\n", - gapstart, gapstart + gapsize - 1); + pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1); } /* * Called late during init, in free_initmem(). * - * Initial e820 and e820_saved are largish __initdata arrays. - * Copy them to (usually much smaller) dynamically allocated area. - * This is done after all tweaks we ever do to them: - * all functions which modify them are __init functions, - * they won't exist after this point. + * Initial e820_table and e820_table_kexec are largish __initdata arrays. + * + * Copy them to a (usually much smaller) dynamically allocated area that is + * sized precisely after the number of e820 entries. + * + * This is done after we've performed all the fixes and tweaks to the tables. + * All functions which modify them are __init functions, which won't exist + * after free_initmem(). */ -__init void e820_reallocate_tables(void) +__init void e820__reallocate_tables(void) { - struct e820map *n; + struct e820_table *n; int size; - size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820_table, size); + e820_table = n; + + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820, size); - e820 = n; + memcpy(n, e820_table_kexec, size); + e820_table_kexec = n; - size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820_saved, size); - e820_saved = n; + memcpy(n, e820_table_firmware, size); + e820_table_firmware = n; } -/** - * Because of the size limitation of struct boot_params, only first - * 128 E820 memory entries are passed to kernel via - * boot_params.e820_map, others are passed via SETUP_E820_EXT node of - * linked list of struct setup_data, which is parsed here. +/* + * Because of the small fixed size of struct boot_params, only the first + * 128 E820 memory entries are passed to the kernel via boot_params.e820_table, + * the remaining (if any) entries are passed via the SETUP_E820_EXT node of + * struct setup_data, which is parsed here. */ -void __init parse_e820_ext(u64 phys_addr, u32 data_len) +void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) { int entries; - struct e820entry *extmap; + struct boot_e820_entry *extmap; struct setup_data *sdata; sdata = early_memremap(phys_addr, data_len); - entries = sdata->len / sizeof(struct e820entry); - extmap = (struct e820entry *)(sdata->data); - __append_e820_map(extmap, entries); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + entries = sdata->len / sizeof(*extmap); + extmap = (struct boot_e820_entry *)(sdata->data); + + __append_e820_table(extmap, entries); + e820__update_table(e820_table); + + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + early_memunmap(sdata, data_len); - printk(KERN_INFO "e820: extended physical RAM map:\n"); - e820_print_map("extended"); + pr_info("e820: extended physical RAM map:\n"); + e820__print_table("extended"); } -#if defined(CONFIG_X86_64) || \ - (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) -/** +/* * Find the ranges of physical addresses that do not correspond to - * e820 RAM areas and mark the corresponding pages as nosave for - * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * E820 RAM areas and register the corresponding pages as 'nosave' for + * hibernation (32-bit) or software suspend and suspend to RAM (64-bit). * - * This function requires the e820 map to be sorted and without any + * This function requires the E820 map to be sorted and without any * overlapping entries. */ -void __init e820_mark_nosave_regions(unsigned long limit_pfn) +void __init e820__register_nosave_regions(unsigned long limit_pfn) { int i; unsigned long pfn = 0; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (pfn < PFN_UP(ei->addr)) - register_nosave_region(pfn, PFN_UP(ei->addr)); + if (pfn < PFN_UP(entry->addr)) + register_nosave_region(pfn, PFN_UP(entry->addr)); - pfn = PFN_DOWN(ei->addr + ei->size); + pfn = PFN_DOWN(entry->addr + entry->size); - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) - register_nosave_region(PFN_UP(ei->addr), pfn); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + register_nosave_region(PFN_UP(entry->addr), pfn); if (pfn >= limit_pfn) break; } } -#endif #ifdef CONFIG_ACPI -/** - * Mark ACPI NVS memory region, so that we can save/restore it during - * hibernation and the subsequent resume. +/* + * Register ACPI NVS memory regions, so that we can save/restore them during + * hibernation and the subsequent resume: */ -static int __init e820_mark_nvs_memory(void) +static int __init e820__register_nvs_regions(void) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (ei->type == E820_NVS) - acpi_nvs_register(ei->addr, ei->size); + if (entry->type == E820_TYPE_NVS) + acpi_nvs_register(entry->addr, entry->size); } return 0; } -core_initcall(e820_mark_nvs_memory); +core_initcall(e820__register_nvs_regions); #endif /* - * pre allocated 4k and reserved it in memblock and e820_saved + * Allocate the requested number of bytes with the requsted alignment + * and return (the physical address) to the caller. Also register this + * range in the 'kexec' E820 table as a reserved range. + * + * This allows kexec to fake a new mptable, as if it came from the real + * system. */ -u64 __init early_reserve_e820(u64 size, u64 align) +u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) { u64 addr; addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { - e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); - printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); - update_e820_saved(); + e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n"); + e820__update_table_kexec(); } return addr; @@ -783,22 +780,22 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type) { int i; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; unsigned long start_pfn; unsigned long end_pfn; - if (ei->type != type) + if (entry->type != type) continue; - start_pfn = ei->addr >> PAGE_SHIFT; - end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + start_pfn = entry->addr >> PAGE_SHIFT; + end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT; if (start_pfn >= limit_pfn) continue; @@ -813,18 +810,19 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", + pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } -unsigned long __init e820_end_of_ram_pfn(void) + +unsigned long __init e820__end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); + return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM); } -unsigned long __init e820_end_of_low_ram_pfn(void) +unsigned long __init e820__end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM); } static void __init early_panic(char *msg) @@ -835,7 +833,7 @@ static void __init early_panic(char *msg) static int userdef __initdata; -/* "mem=nopentium" disables the 4MB page tables. */ +/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ static int __init parse_memopt(char *p) { u64 mem_size; @@ -848,17 +846,19 @@ static int __init parse_memopt(char *p) setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; #else - printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + pr_warn("mem=nopentium ignored! (only supported on x86_32)\n"); return -EINVAL; #endif } userdef = 1; mem_size = memparse(p, &p); - /* don't remove all of memory when handling "mem={invalid}" param */ + + /* Don't remove all memory when getting "mem={invalid}" parameter: */ if (mem_size == 0) return -EINVAL; - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); return 0; } @@ -876,12 +876,12 @@ static int __init parse_memmap_one(char *p) #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know - * the real mem size before original memory map is + * the real memory size before the original memory map is * reset. */ - saved_max_pfn = e820_end_of_ram_pfn(); + saved_max_pfn = e820__end_of_ram_pfn(); #endif - e820->nr_map = 0; + e820_table->nr_entries = 0; userdef = 1; return 0; } @@ -894,21 +894,23 @@ static int __init parse_memmap_one(char *p) userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_RAM); + e820__range_add(start_at, mem_size, E820_TYPE_RAM); } else if (*p == '#') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_ACPI); + e820__range_add(start_at, mem_size, E820_TYPE_ACPI); } else if (*p == '$') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_RESERVED); + e820__range_add(start_at, mem_size, E820_TYPE_RESERVED); } else if (*p == '!') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_PRAM); - } else - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else { + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + } return *p == '\0' ? 0 : -EINVAL; } + static int __init parse_memmap_opt(char *str) { while (str) { @@ -925,68 +927,97 @@ static int __init parse_memmap_opt(char *str) } early_param("memmap", parse_memmap_opt); -void __init finish_e820_parsing(void) +/* + * Reserve all entries from the bootloader's extensible data nodes list, + * because if present we are going to use it later on to fetch e820 + * entries from it: + */ +void __init e820__reserve_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return; + + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + pa_data = data->next; + early_memunmap(data, sizeof(*data)); + } + + e820__update_table(e820_table); + e820__update_table(e820_table_kexec); + + pr_info("extended physical RAM map:\n"); + e820__print_table("reserve setup_data"); +} + +/* + * Called after parse_early_param(), after early parameters (such as mem=) + * have been processed, in which case we already have an E820 table filled in + * via the parameter callback function(s), but it's not sorted and printed yet: + */ +void __init e820__finish_early_params(void) { if (userdef) { - if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), - &e820->nr_map) < 0) + if (e820__update_table(e820_table) < 0) early_panic("Invalid user supplied memory map"); - printk(KERN_INFO "e820: user-defined physical RAM map:\n"); - e820_print_map("user"); + pr_info("e820: user-defined physical RAM map:\n"); + e820__print_table("user"); } } -static const char *__init e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(struct e820_entry *entry) { - switch (e820_type) { - case E820_RESERVED_KERN: - case E820_RAM: return "System RAM"; - case E820_ACPI: return "ACPI Tables"; - case E820_NVS: return "ACPI Non-volatile Storage"; - case E820_UNUSABLE: return "Unusable memory"; - case E820_PRAM: return "Persistent Memory (legacy)"; - case E820_PMEM: return "Persistent Memory"; - default: return "reserved"; + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return "System RAM"; + case E820_TYPE_ACPI: return "ACPI Tables"; + case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; + case E820_TYPE_UNUSABLE: return "Unusable memory"; + case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; + case E820_TYPE_PMEM: return "Persistent Memory"; + case E820_TYPE_RESERVED: return "Reserved"; + default: return "Unknown E820 type"; } } -static unsigned long __init e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) { - switch (e820_type) { - case E820_RESERVED_KERN: - case E820_RAM: - return IORESOURCE_SYSTEM_RAM; - case E820_ACPI: - case E820_NVS: - case E820_UNUSABLE: - case E820_PRAM: - case E820_PMEM: - default: - return IORESOURCE_MEM; + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; + case E820_TYPE_ACPI: /* Fall-through: */ + case E820_TYPE_NVS: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_PRAM: /* Fall-through: */ + case E820_TYPE_PMEM: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + default: return IORESOURCE_MEM; } } -static unsigned long __init e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) { - switch (e820_type) { - case E820_ACPI: - return IORES_DESC_ACPI_TABLES; - case E820_NVS: - return IORES_DESC_ACPI_NV_STORAGE; - case E820_PMEM: - return IORES_DESC_PERSISTENT_MEMORY; - case E820_PRAM: - return IORES_DESC_PERSISTENT_MEMORY_LEGACY; - case E820_RESERVED_KERN: - case E820_RAM: - case E820_UNUSABLE: - default: - return IORES_DESC_NONE; + switch (entry->type) { + case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES; + case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; + case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; + case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + default: return IORES_DESC_NONE; } } -static bool __init do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(enum e820_type type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -997,61 +1028,72 @@ static bool __init do_mark_busy(u32 type, struct resource *res) * for exclusive use of a driver */ switch (type) { - case E820_RESERVED: - case E820_PRAM: - case E820_PMEM: + case E820_TYPE_RESERVED: + case E820_TYPE_PRAM: + case E820_TYPE_PMEM: return false; + case E820_TYPE_RESERVED_KERN: + case E820_TYPE_RAM: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: default: return true; } } /* - * Mark e820 reserved areas as busy for the resource manager. + * Mark E820 reserved areas as busy for the resource manager: */ + static struct resource __initdata *e820_res; -void __init e820_reserve_resources(void) + +void __init e820__reserve_resources(void) { int i; struct resource *res; u64 end; - res = alloc_bootmem(sizeof(struct resource) * e820->nr_map); + res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries); e820_res = res; - for (i = 0; i < e820->nr_map; i++) { - end = e820->map[i].addr + e820->map[i].size - 1; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = e820_table->entries + i; + + end = entry->addr + entry->size - 1; if (end != (resource_size_t)end) { res++; continue; } - res->name = e820_type_to_string(e820->map[i].type); - res->start = e820->map[i].addr; - res->end = end; - - res->flags = e820_type_to_iomem_type(e820->map[i].type); - res->desc = e820_type_to_iores_desc(e820->map[i].type); + res->start = entry->addr; + res->end = end; + res->name = e820_type_to_string(entry); + res->flags = e820_type_to_iomem_type(entry); + res->desc = e820_type_to_iores_desc(entry); /* - * don't register the region that could be conflicted with - * pci device BAR resource and insert them later in - * pcibios_resource_survey() + * Don't register the region that could be conflicted with + * PCI device BAR resources and insert them later in + * pcibios_resource_survey(): */ - if (do_mark_busy(e820->map[i].type, res)) { + if (do_mark_busy(entry->type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - for (i = 0; i < e820_saved->nr_map; i++) { - struct e820entry *entry = &e820_saved->map[i]; - firmware_map_add_early(entry->addr, - entry->addr + entry->size, - e820_type_to_string(entry->type)); + /* Expose the bootloader-provided memory layout to the sysfs. */ + for (i = 0; i < e820_table_firmware->nr_entries; i++) { + struct e820_entry *entry = e820_table_firmware->entries + i; + + firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } } -/* How much should we pad RAM ending depending on where it is? */ +/* + * How much should we pad the end of RAM, depending on where it is? + */ static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1070,64 +1112,59 @@ static unsigned long __init ram_alignment(resource_size_t pos) #define MAX_RESOURCE_SIZE ((resource_size_t)-1) -void __init e820_reserve_resources_late(void) +void __init e820__reserve_resources_late(void) { int i; struct resource *res; res = e820_res; - for (i = 0; i < e820->nr_map; i++) { + for (i = 0; i < e820_table->nr_entries; i++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; } /* - * Try to bump up RAM regions to reasonable boundaries to + * Try to bump up RAM regions to reasonable boundaries, to * avoid stolen RAM: */ - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *entry = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; u64 start, end; - if (entry->type != E820_RAM) + if (entry->type != E820_TYPE_RAM) continue; + start = entry->addr + entry->size; end = round_up(start, ram_alignment(start)) - 1; if (end > MAX_RESOURCE_SIZE) end = MAX_RESOURCE_SIZE; if (start >= end) continue; - printk(KERN_DEBUG - "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", - start, end); - reserve_region_with_split(&iomem_resource, start, end, - "RAM buffer"); + + printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end); + reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } } -char *__init default_machine_specific_memory_setup(void) +/* + * Pass the firmware (bootloader) E820 map to the kernel and process it: + */ +char *__init e820__memory_setup_default(void) { char *who = "BIOS-e820"; - u32 new_nr; + /* * Try to copy the BIOS-supplied E820-map. * * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) - < 0) { + if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) { u64 mem_size; - /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { + /* Compare results from other methods and take the one that gives more RAM: */ + if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { mem_size = boot_params.screen_info.ext_mem_k; who = "BIOS-88"; } else { @@ -1135,84 +1172,69 @@ char *__init default_machine_specific_memory_setup(void) who = "BIOS-e801"; } - e820->nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + e820_table->nr_entries = 0; + e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM); + e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM); } - /* In case someone cares... */ + /* We just appended a lot of ranges, sanitize the table: */ + e820__update_table(e820_table); + return who; } -void __init setup_memory_map(void) +/* + * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader + * E820 map - with an optional platform quirk available for virtual platforms + * to override this method of boot environment processing: + */ +void __init e820__memory_setup(void) { char *who; + /* This is a firmware interface ABI - make sure we don't break it: */ + BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20); + who = x86_init.resources.memory_setup(); - memcpy(e820_saved, e820, sizeof(struct e820map)); - printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); - e820_print_map(who); + + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("e820: BIOS-provided physical RAM map:\n"); + e820__print_table(who); } -void __init memblock_x86_fill(void) +void __init e820__memblock_setup(void) { int i; u64 end; /* - * EFI may have more than 128 entries - * We are safe to enable resizing, beause memblock_x86_fill() - * is rather later for x86 + * The bootstrap memblock region count maximum is 128 entries + * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries + * than that - so allow memblock resizing. + * + * This is safe, because this call happens pretty late during x86 setup, + * so we know about reserved memory regions already. (This is important + * so that memblock resizing does no stomp over reserved areas.) */ memblock_allow_resize(); - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - end = ei->addr + ei->size; + end = entry->addr + entry->size; if (end != (resource_size_t)end) continue; - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; - memblock_add(ei->addr, ei->size); + memblock_add(entry->addr, entry->size); } - /* throw away partial pages */ + /* Throw away partial pages: */ memblock_trim_memory(PAGE_SIZE); memblock_dump_all(); } - -void __init memblock_find_dma_reserve(void) -{ -#ifdef CONFIG_X86_64 - u64 nr_pages = 0, nr_free_pages = 0; - unsigned long start_pfn, end_pfn; - phys_addr_t start, end; - int i; - u64 u; - - /* - * need to find out used area below MAX_DMA_PFN - * need to use memblock to get free size in [0, MAX_DMA_PFN] - * at first, and assume boot_mem will not take below MAX_DMA_PFN - */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min(start_pfn, MAX_DMA_PFN); - end_pfn = min(end_pfn, MAX_DMA_PFN); - nr_pages += end_pfn - start_pfn; - } - - for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, - NULL) { - start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); - end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); - if (start_pfn < end_pfn) - nr_free_pages += end_pfn - start_pfn; - } - - set_dma_reserve(nr_pages - nr_free_pages); -#endif -} diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6a08e25a48d8..d907c3d8633f 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -526,6 +526,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_SKL_IDS(&gen9_early_ops), INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), + INTEL_GLK_IDS(&gen9_early_ops), }; static void __init @@ -546,8 +547,8 @@ intel_graphics_stolen(int num, int slot, int func, &base, &end); /* Mark this space as reserved */ - e820_add_region(base, size, E820_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__range_add(base, size, E820_TYPE_RESERVED); + e820__update_table(e820_table); } static void __init intel_graphics_quirks(int num, int slot, int func) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 8a121991e5ba..0f0840304452 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -17,6 +17,7 @@ #include <asm/intel-mid.h> #include <asm/pgtable.h> #include <linux/usb/ehci_def.h> +#include <linux/usb/xhci-dbgp.h> #include <linux/efi.h> #include <asm/efi.h> #include <asm/pci_x86.h> @@ -381,6 +382,10 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "efi", 3)) early_console_register(&early_efi_console, keep); #endif +#ifdef CONFIG_EARLY_PRINTK_USB_XDBC + if (!strncmp(buf, "xdbc", 4)) + early_xdbc_parse_parameter(buf + 4); +#endif buf++; } diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 04f89caef9c4..6b91e2eb8d3f 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -50,11 +50,11 @@ #define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) /* There is address space for how many espfix pages? */ -#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) +#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16)) #define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) #if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS -# error "Need more than one PGD for the ESPFIX hack" +# error "Need more virtual address space for the ESPFIX hack" #endif #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) @@ -121,11 +121,13 @@ static void init_espfix_random(void) void __init init_espfix_bsp(void) { - pgd_t *pgd_p; + pgd_t *pgd; + p4d_t *p4d; /* Install the espfix pud into the kernel page directory */ - pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; - pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); + p4d_populate(&init_mm, p4d, espfix_pud_page); /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index de7234401275..e1114f070c2d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -9,7 +9,6 @@ #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> -#include <asm/fpu/xstate.h> #include <asm/traps.h> #include <linux/hardirq.h> @@ -179,14 +178,8 @@ void fpstate_init(union fpregs_state *state) memset(state, 0, fpu_kernel_xstate_size); - /* - * XRSTORS requires that this bit is set in xcomp_bv, or - * it will #GP. Make sure it is replaced after the memset(). - */ if (static_cpu_has(X86_FEATURE_XSAVES)) - state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; - + fpstate_init_xstate(&state->xsave); if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 60dece392b3a..d5d44c452624 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -7,6 +7,7 @@ #include <asm/cmdline.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/init.h> /* @@ -48,13 +49,7 @@ void fpu__init_cpu(void) fpu__init_cpu_xstate(); } -/* - * The earliest FPU detection code. - * - * Set the X86_FEATURE_FPU CPU-capability bit based on - * trying to execute an actual sequence of FPU instructions: - */ -static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +static bool fpu__probe_without_cpuid(void) { unsigned long cr0; u16 fsw, fcw; @@ -65,18 +60,25 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) cr0 &= ~(X86_CR0_TS | X86_CR0_EM); write_cr0(cr0); - if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw)); + + pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw); - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); + return fsw == 0 && (fcw & 0x103f) == 0x003f; +} + +static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +{ + if (!boot_cpu_has(X86_FEATURE_CPUID) && + !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + if (fpu__probe_without_cpuid()) + setup_force_cpu_cap(X86_FEATURE_FPU); else - clear_cpu_cap(c, X86_FEATURE_FPU); + setup_clear_cpu_cap(X86_FEATURE_FPU); } #ifndef CONFIG_MATH_EMULATION - if (!boot_cpu_has(X86_FEATURE_FPU)) { + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) { pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); @@ -88,6 +90,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c114b132d121..b188b16841e3 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -5,6 +5,7 @@ #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> +#include <linux/sched/task_stack.h> /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1d7770447b3e..c24ac1efb12d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -78,6 +78,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); + setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* @@ -705,8 +706,14 @@ void __init fpu__init_system_xstate(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; + if (!boot_cpu_has(X86_FEATURE_FPU)) { + pr_info("x86/fpu: No FPU detected\n"); + return; + } + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + pr_info("x86/fpu: x87 FPU will use %s\n", + boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8639bb2ae058..9bef1bbeba63 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,7 +24,7 @@ #include <trace/syscall.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> @@ -533,9 +533,15 @@ static void do_sync_core(void *data) static void run_sync(void) { - int enable_irqs = irqs_disabled(); + int enable_irqs; - /* We may be called with interrupts disbled (on bootup). */ + /* No need to sync if there's only one CPU */ + if (num_online_cpus() == 1) + return; + + enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disabled (on bootup). */ if (enable_irqs) local_irq_enable(); on_each_cpu(do_sync_core, NULL, 1); @@ -683,8 +689,12 @@ static inline void *alloc_tramp(unsigned long size) { return module_alloc(size); } -static inline void tramp_free(void *tramp) +static inline void tramp_free(void *tramp, int size) { + int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; + + set_memory_nx((unsigned long)tramp, npages); + set_memory_rw((unsigned long)tramp, npages); module_memfree(tramp); } #else @@ -693,7 +703,7 @@ static inline void *alloc_tramp(unsigned long size) { return NULL; } -static inline void tramp_free(void *tramp) { } +static inline void tramp_free(void *tramp, int size) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ @@ -765,7 +775,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* Copy ftrace_caller onto the trampoline memory */ ret = probe_kernel_read(trampoline, (void *)start_offset, size); if (WARN_ON(ret < 0)) { - tramp_free(trampoline); + tramp_free(trampoline, *tramp_size); return 0; } @@ -791,7 +801,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* Are we pointing to the reference? */ if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) { - tramp_free(trampoline); + tramp_free(trampoline, *tramp_size); return 0; } @@ -833,7 +843,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) unsigned long offset; unsigned long ip; unsigned int size; - int ret; + int ret, npages; if (ops->trampoline) { /* @@ -842,11 +852,14 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) */ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; + npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT; + set_memory_rw(ops->trampoline, npages); } else { ops->trampoline = create_trampoline(ops, &size); if (!ops->trampoline) return; ops->trampoline_size = size; + npages = PAGE_ALIGN(size) >> PAGE_SHIFT; } offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); @@ -857,6 +870,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); ret = update_ftrace_func(ip, new); + set_memory_ro(ops->trampoline, npages); /* The update should never fail */ WARN_ON(ret); @@ -933,7 +947,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; - tramp_free((void *)ops->trampoline); + tramp_free((void *)ops->trampoline, ops->trampoline_size); ops->trampoline = 0; } @@ -983,6 +997,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long return_hooker = (unsigned long) &return_to_handler; + /* + * When resuming from suspend-to-ram, this function can be indirectly + * called from early CPU startup code while the CPU is in real mode, + * which would fail miserably. Make sure the stack pointer is a + * virtual address. + * + * This check isn't as accurate as virt_addr_valid(), but it should be + * good enough for this purpose, and it's fast. + */ + if (unlikely((long)__builtin_frame_address(0) >= 0)) + return; + if (unlikely(ftrace_graph_is_dead())) return; diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S new file mode 100644 index 000000000000..722a145b4139 --- /dev/null +++ b/arch/x86/kernel/ftrace_32.S @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2017 Steven Rostedt, VMware Inc. + */ + +#include <linux/linkage.h> +#include <asm/page_types.h> +#include <asm/segment.h> +#include <asm/export.h> +#include <asm/ftrace.h> + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +EXPORT_SYMBOL(__fentry__) +#else +# define function_hook mcount +EXPORT_SYMBOL(mcount) +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* mcount uses a frame pointer even if CONFIG_FRAME_POINTER is not set */ +#if !defined(CC_USING_FENTRY) || defined(CONFIG_FRAME_POINTER) +# define USING_FRAME_POINTER +#endif + +#ifdef USING_FRAME_POINTER +# define MCOUNT_FRAME 1 /* using frame = true */ +#else +# define MCOUNT_FRAME 0 /* using frame = false */ +#endif + +ENTRY(function_hook) + ret +END(function_hook) + +ENTRY(ftrace_caller) + +#ifdef USING_FRAME_POINTER +# ifdef CC_USING_FENTRY + /* + * Frame pointers are of ip followed by bp. + * Since fentry is an immediate jump, we are left with + * parent-ip, function-ip. We need to add a frame with + * parent-ip followed by ebp. + */ + pushl 4(%esp) /* parent ip */ + pushl %ebp + movl %esp, %ebp + pushl 2*4(%esp) /* function ip */ +# endif + /* For mcount, the function ip is directly above */ + pushl %ebp + movl %esp, %ebp +#endif + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + +#ifdef USING_FRAME_POINTER + /* Load parent ebp into edx */ + movl 4*4(%esp), %edx +#else + /* There's no frame pointer, load the appropriate stack addr instead */ + lea 4*4(%esp), %edx +#endif + + movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */ + /* Get the parent ip */ + movl 4(%edx), %edx /* edx has ebp */ + + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +#ifdef USING_FRAME_POINTER + popl %ebp +# ifdef CC_USING_FENTRY + addl $4,%esp /* skip function ip */ + popl %ebp /* this is the orig bp */ + addl $4, %esp /* skip parent ip */ +# endif +#endif +.Lftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * regs->ip location, and move flags into the return ip location. + */ + pushl $__KERNEL_CS + pushl 4(%esp) /* Save the return ip */ + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + + /* Get flags and place them into the return ip slot */ + pushf + popl %eax + movl %eax, 8*4(%esp) + + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ +#ifdef CC_USING_FENTRY + movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */ +#else + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ +#endif + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + + /* restore flags */ + push 14*4(%esp) + popf + + /* Move return ip back to its original location */ + movl 12*4(%esp), %eax + movl %eax, 14*4(%esp) + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + + /* use lea to not affect flags */ + lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */ + + jmp .Lftrace_ret +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(function_hook) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $ftrace_stub, ftrace_trace_function + jnz .Ltrace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +.Ltrace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(function_hook) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 3*4(%esp), %eax + /* Even with frame pointers, fentry doesn't have one here */ +#ifdef CC_USING_FENTRY + lea 4*4(%esp), %edx + movl $0, %ecx +#else + lea 0x4(%ebp), %edx + movl (%ebp), %ecx +#endif + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx +#ifdef CC_USING_FENTRY + movl $0, %eax +#else + movl %ebp, %eax +#endif + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/ftrace_64.S index 7b0d3da52fb4..1dfac634bbf7 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -1,6 +1,4 @@ /* - * linux/arch/x86_64/mcount_64.S - * * Copyright (C) 2014 Steven Rostedt, Red Hat Inc */ @@ -13,9 +11,6 @@ .code64 .section .entry.text, "ax" - -#ifdef CONFIG_FUNCTION_TRACER - #ifdef CC_USING_FENTRY # define function_hook __fentry__ EXPORT_SYMBOL(__fentry__) @@ -297,7 +292,6 @@ trace: jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index f16c55bfc090..538ec012b371 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,7 +12,7 @@ #include <asm/setup.h> #include <asm/sections.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/page.h> #include <asm/apic.h> #include <asm/io_apic.h> @@ -49,3 +49,65 @@ asmlinkage __visible void __init i386_start_kernel(void) start_kernel(); } + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + * + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. Note the upper half of each PMD or PTE are + * always zero at this stage. + */ +void __init mk_early_pgtbl_32(void) +{ +#ifdef __pa +#undef __pa +#endif +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) + pte_t pte, *ptep; + int i; + unsigned long *ptr; + /* Enough space to fit pagetables for the low memory linear map */ + const unsigned long limit = __pa(_end) + + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT); +#ifdef CONFIG_X86_PAE + pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd); +#define SET_PL2(pl2, val) { (pl2).pmd = (val); } +#else + pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table); +#define SET_PL2(pl2, val) { (pl2).pgd = (val); } +#endif + + ptep = (pte_t *)__pa(__brk_base); + pte.pte = PTE_IDENT_ATTR; + + while ((pte.pte & PTE_PFN_MASK) < limit) { + + SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR); + *pl2p = pl2; +#ifndef CONFIG_X86_PAE + /* Kernel PDE entry */ + *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2; +#endif + for (i = 0; i < PTRS_PER_PTE; i++) { + *ptep = pte; + pte.pte += PAGE_SIZE; + ptep++; + } + + pl2p++; + } + + ptr = (unsigned long *)__pa(&max_pfn_mapped); + /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */ + *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + + ptr = (unsigned long *)__pa(&_brk_end); + *ptr = (unsigned long)ptep + PAGE_OFFSET; +} + diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 54a2372f5dbb..46c3c73e7f43 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -4,6 +4,7 @@ * Copyright (C) 2000 Andrea Arcangeli <[email protected]> SuSE */ +#define DISABLE_BRANCH_PROFILING #include <linux/init.h> #include <linux/linkage.h> #include <linux/types.h> @@ -23,7 +24,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/kdebug.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/bios_ebda.h> #include <asm/bootparam_utils.h> #include <asm/microcode.h> @@ -32,17 +33,120 @@ /* * Manage page tables very early on. */ -extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; -static unsigned int __initdata next_early_pgt = 2; +static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +#define __head __section(.head.text) + +static void __head *fixup_pointer(void *ptr, unsigned long physaddr) +{ + return ptr - (void *)_text + (void *)physaddr; +} + +void __head __startup_64(unsigned long physaddr) +{ + unsigned long load_delta, *p; + pgdval_t *pgd; + p4dval_t *p4d; + pudval_t *pud; + pmdval_t *pmd, pmd_entry; + int i; + + /* Is the address too large? */ + if (physaddr >> MAX_PHYSMEM_BITS) + for (;;); + + /* + * Compute the delta between the address I am compiled to run at + * and the address I am actually running at. + */ + load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + + /* Is the address not 2M aligned? */ + if (load_delta & ~PMD_PAGE_MASK) + for (;;); + + /* Fixup the physical addresses in the page table */ + + pgd = fixup_pointer(&early_top_pgt, physaddr); + pgd[pgd_index(__START_KERNEL_map)] += load_delta; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = fixup_pointer(&level4_kernel_pgt, physaddr); + p4d[511] += load_delta; + } + + pud = fixup_pointer(&level3_kernel_pgt, physaddr); + pud[510] += load_delta; + pud[511] += load_delta; + + pmd = fixup_pointer(level2_fixmap_pgt, physaddr); + pmd[506] += load_delta; + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ + + pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + + i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; + p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + } else { + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + } + + i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; + pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + + pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += physaddr; + + for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { + int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD; + pmd[idx] = pmd_entry + i * PMD_SIZE; + } + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + */ + + pmd = fixup_pointer(level2_kernel_pgt, physaddr); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (pmd[i] & _PAGE_PRESENT) + pmd[i] += load_delta; + } + + /* Fixup phys_base */ + p = fixup_pointer(&phys_base, physaddr); + *p += load_delta; +} + /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); + memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); + write_cr3(__pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ @@ -50,15 +154,16 @@ int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; + p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? */ - if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) + if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) return -1; again: - pgd_p = &early_level4_pgt[pgd_index(address)].pgd; + pgd_p = &early_top_pgt[pgd_index(address)].pgd; pgd = *pgd_p; /* @@ -66,8 +171,25 @@ again: * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... */ - if (pgd) - pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = pgd_p; + else if (pgd) + p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + p4d_p += p4d_index(address); + p4d = *p4d_p; + + if (p4d) + pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); else { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); @@ -76,7 +198,7 @@ again: pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); pud = *pud_p; @@ -155,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); - clear_page(init_level4_pgt); + clear_page(init_top_pgt); kasan_early_init(); @@ -170,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - /* set init_level4_pgt kernel high mapping*/ - init_level4_pgt[511] = early_level4_pgt[511]; + /* set init_top_pgt kernel high mapping*/ + init_top_pgt[511] = early_top_pgt[511]; x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 4e8577d03372..1f85ee8f9439 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -24,6 +24,7 @@ #include <asm/nops.h> #include <asm/bootparam.h> #include <asm/export.h> +#include <asm/pgtable_32.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -41,44 +42,10 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -/* - * This is how much memory in addition to the memory covered up to - * and including _end we need mapped initially. - * We need: - * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) - * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) - * - * Modulo rounding, each megabyte assigned here requires a kilobyte of - * memory, which is currently unreclaimed. - * - * This should be a multiple of a page. - * - * KERNEL_IMAGE_SIZE should be greater than pa(_end) - * and small than max_low_pfn, otherwise will waste some page table entries - */ - -#if PTRS_PER_PMD > 1 -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) -#else -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) -#endif #define SIZEOF_PTREGS 17*4 /* - * Number of possible pages in the lowmem region. - * - * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a - * gas warning about overflowing shift count when gas has been compiled - * with only a host target support using a 32-bit type for internal - * representation. - */ -LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT) - -/* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT - -/* * Worst-case size of the kernel mapping we need to make: * a relocatable kernel can live anywhere in lowmem, so we need to be able * to map all of lowmem. @@ -160,90 +127,15 @@ ENTRY(startup_32) call load_ucode_bsp #endif -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond __brk_base. The variable - * _brk_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end. - */ -#ifdef CONFIG_X86_PAE - - /* - * In PAE mode initial_page_table is statically defined to contain - * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD entries - * to the first kernel PMD. - * - * Note the upper half of each PMD or PTE are always zero at this stage. - */ - -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ - - xorl %ebx,%ebx /* %ebx is kept at zero */ - - movl $pa(__brk_base), %edi - movl $pa(initial_pg_pmd), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ - movl %ecx,(%edx) /* Store PMD entry */ - /* Upper half already zero */ - addl $8,%edx - movl $512,%ecx -11: - stosl - xchgl %eax,%ebx - stosl - xchgl %eax,%ebx - addl $0x1000,%eax - loop 11b - - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b -1: - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) + /* Create early pagetables. */ + call mk_early_pgtbl_32 /* Do early initialization of the fixmap area */ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#ifdef CONFIG_X86_PAE +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) -#else /* Not PAE */ - -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $pa(__brk_base), %edi - movl $pa(initial_page_table), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#else movl %eax,pa(initial_page_table+0xffc) #endif @@ -666,6 +558,7 @@ ENTRY(setup_once_ref) __PAGE_ALIGNED_BSS .align PAGE_SIZE #ifdef CONFIG_X86_PAE +.globl initial_pg_pmd initial_pg_pmd: .fill 1024*KPMDS,4,0 #else diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b467b14b03eb..6225550883df 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -37,10 +37,11 @@ * */ +#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) -L4_START_KERNEL = pgd_index(__START_KERNEL_map) +PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) +PGD_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -72,101 +73,12 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu - /* - * Compute the delta between the address I am compiled to run at and the - * address I am actually running at. - */ - leaq _text(%rip), %rbp - subq $_text - __START_KERNEL_map, %rbp - - /* Is the address not 2M aligned? */ - testl $~PMD_PAGE_MASK, %ebp - jnz bad_address - - /* - * Is the address too large? - */ - leaq _text(%rip), %rax - shrq $MAX_PHYSMEM_BITS, %rax - jnz bad_address - - /* - * Fixup the physical addresses in the page table - */ - addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) - - addq %rbp, level3_kernel_pgt + (510*8)(%rip) - addq %rbp, level3_kernel_pgt + (511*8)(%rip) - - addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - - /* - * Set up the identity mapping for the switchover. These - * entries should *NOT* have the global bit set! This also - * creates a bunch of nonsense entries but that is fine -- - * it avoids problems around wraparound. - */ leaq _text(%rip), %rdi - leaq early_level4_pgt(%rip), %rbx - - movq %rdi, %rax - shrq $PGDIR_SHIFT, %rax - - leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx - movq %rdx, 0(%rbx,%rax,8) - movq %rdx, 8(%rbx,%rax,8) - - addq $PAGE_SIZE, %rdx - movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andl $(PTRS_PER_PUD-1), %eax - movq %rdx, PAGE_SIZE(%rbx,%rax,8) - incl %eax - andl $(PTRS_PER_PUD-1), %eax - movq %rdx, PAGE_SIZE(%rbx,%rax,8) - - addq $PAGE_SIZE * 2, %rbx - movq %rdi, %rax - shrq $PMD_SHIFT, %rdi - addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax - leaq (_end - 1)(%rip), %rcx - shrq $PMD_SHIFT, %rcx - subq %rdi, %rcx - incl %ecx + pushq %rsi + call __startup_64 + popq %rsi -1: - andq $(PTRS_PER_PMD - 1), %rdi - movq %rax, (%rbx,%rdi,8) - incq %rdi - addq $PMD_SIZE, %rax - decl %ecx - jnz 1b - - test %rbp, %rbp - jz .Lskip_fixup - - /* - * Fixup the kernel text+data virtual addresses. Note that - * we might write invalid pmds, when the kernel is relocated - * cleanup_highmap() fixes this up along with the mappings - * beyond _end. - */ - leaq level2_kernel_pgt(%rip), %rdi - leaq PAGE_SIZE(%rdi), %r8 - /* See if it is a valid page table entry */ -1: testb $_PAGE_PRESENT, 0(%rdi) - jz 2f - addq %rbp, 0(%rdi) - /* Go to the next page */ -2: addq $8, %rdi - cmp %r8, %rdi - jne 1b - - /* Fixup phys_base */ - addq %rbp, phys_base(%rip) - -.Lskip_fixup: - movq $(early_level4_pgt - __START_KERNEL_map), %rax + movq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -186,14 +98,17 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_level4_pgt - __START_KERNEL_map), %rax + movq $(init_top_pgt - __START_KERNEL_map), %rax 1: - /* Enable PAE mode and PGE */ + /* Enable PAE mode, PGE and LA57 */ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx +#ifdef CONFIG_X86_5LEVEL + orl $X86_CR4_LA57, %ecx +#endif movq %rcx, %cr4 - /* Setup early boot stage 4 level pagetables. */ + /* Setup early boot stage 4-/5-level pagetables. */ addq phys_base(%rip), %rax movq %rax, %cr3 @@ -269,10 +184,8 @@ ENTRY(secondary_startup_64) /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - jmp start_cpu -ENDPROC(secondary_startup_64) -ENTRY(start_cpu) +.Ljump_to_C_code: /* * Jump to run C code and to be on a real kernel address. * Since we are running on identity-mapped space we have to jump @@ -305,7 +218,7 @@ ENTRY(start_cpu) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(start_cpu) +ENDPROC(secondary_startup_64) #include "verify_cpu.S" @@ -313,11 +226,11 @@ ENDPROC(start_cpu) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary() via start_cpu(). + * start_secondary() via .Ljump_to_C_code. */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp - jmp start_cpu + jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif @@ -419,9 +332,13 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_level4_pgt) +NEXT_PAGE(early_top_pgt) .fill 511,8,0 +#ifdef CONFIG_X86_5LEVEL + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE +#else .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE +#endif NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -429,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts) .data #ifndef CONFIG_XEN -NEXT_PAGE(init_level4_pgt) +NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else -NEXT_PAGE(init_level4_pgt) +NEXT_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 + .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE @@ -450,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt) PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #endif +#ifdef CONFIG_X86_5LEVEL +NEXT_PAGE(level4_kernel_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE +#endif + NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dc6ba5bda9fc..16f82a3aaec7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -285,7 +285,7 @@ static void hpet_legacy_clockevent_register(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); + hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index); clockevents_config_and_register(&hpet_clockevent, hpet_freq, HPET_MIN_PROG_DELTA, 0x7FFFFFFF); global_clock_event = &hpet_clockevent; @@ -354,7 +354,7 @@ static int hpet_resume(struct clock_event_device *evt, int timer) irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); - disable_irq(hdev->irq); + disable_hardirq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index be22f5a2192e..4e3b8a587c88 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -418,6 +418,7 @@ struct legacy_pic default_legacy_pic = { }; struct legacy_pic *legacy_pic = &default_legacy_pic; +EXPORT_SYMBOL(legacy_pic); static int __init i8259A_init_ops(void) { diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 589b3193f102..9c3cf0944bce 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/errno.h> @@ -16,6 +17,7 @@ #include <linux/syscalls.h> #include <linux/bitmap.h> #include <asm/syscalls.h> +#include <asm/desc.h> /* * this changes the io permissions bitmap in the current task. @@ -45,6 +47,16 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; set_thread_flag(TIF_IO_BITMAP); + + /* + * Now that we have an IO bitmap, we need our TSS limit to be + * correct. It's fine if we are preempted after doing this: + * with TIF_IO_BITMAP set, context switches will keep our TSS + * limit correct. + */ + preempt_disable(); + refresh_tss_limit(); + preempt_enable(); } /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7c6e9ffe4424..4aa03c5a14c9 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -264,7 +264,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -__visible void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -315,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) } #endif -__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -394,6 +394,9 @@ int check_irq_vectors_for_cpu_disable(void) !cpumask_subset(&affinity_new, &online_new)) this_count++; } + /* No need to check any further. */ + if (!this_count) + return 0; count = 0; for_each_online_cpu(cpu) { @@ -411,8 +414,10 @@ int check_irq_vectors_for_cpu_disable(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { if (!test_bit(vector, used_vectors) && - IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - count++; + IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { + if (++count == this_count) + return 0; + } } } @@ -427,84 +432,12 @@ int check_irq_vectors_for_cpu_disable(void) /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irq, vector; - static int warned; + unsigned int irr, vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; - int ret; - - for_each_irq_desc(irq, desc) { - int break_affinity = 0; - int set_affinity = 1; - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - raw_spin_lock(&desc->lock); - - data = irq_desc_get_irq_data(desc); - affinity = irq_data_get_affinity_mask(data); - if (!irq_has_action(irq) || irqd_is_per_cpu(data) || - cpumask_subset(affinity, cpu_online_mask)) { - raw_spin_unlock(&desc->lock); - continue; - } - - /* - * Complete the irq move. This cpu is going down and for - * non intr-remapping case, we can't wait till this interrupt - * arrives at this cpu before completing the irq move. - */ - irq_force_complete_move(desc); - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - break_affinity = 1; - affinity = cpu_online_mask; - } - chip = irq_data_get_irq_chip(data); - /* - * The interrupt descriptor might have been cleaned up - * already, but it is not yet removed from the radix tree - */ - if (!chip) { - raw_spin_unlock(&desc->lock); - continue; - } - - if (!irqd_can_move_in_process_context(data) && chip->irq_mask) - chip->irq_mask(data); - - if (chip->irq_set_affinity) { - ret = chip->irq_set_affinity(data, affinity, true); - if (ret == -ENOSPC) - pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq); - } else { - if (!(warned++)) - set_affinity = 0; - } - - /* - * We unmask if the irq was not marked masked by the - * core code. That respects the lazy irq disable - * behaviour. - */ - if (!irqd_can_move_in_process_context(data) && - !irqd_irq_masked(data) && chip->irq_unmask) - chip->irq_unmask(data); - - raw_spin_unlock(&desc->lock); - - if (break_affinity && set_affinity) - pr_notice("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - pr_notice("Cannot set affinity for irq %i\n", irq); - } + irq_migrate_all_off_this_cpu(); /* * We can remove mdelay() and then send spuriuous interrupts to @@ -523,8 +456,6 @@ void fixup_irqs(void) * nothing else will touch it. */ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irr; - if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6b0678a541e2..3be74fbdeff2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -15,6 +15,7 @@ #include <linux/ftrace.h> #include <linux/uaccess.h> #include <linux/smp.h> +#include <linux/sched/task_stack.h> #include <asm/io_apic.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 3512ba607361..275487872be2 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,6 +9,7 @@ #include <linux/hardirq.h> #include <asm/apic.h> #include <asm/trace/irq_vectors.h> +#include <linux/interrupt.h> static inline void __smp_irq_work_interrupt(void) { @@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -__visible void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1423ab1b0312..7468c6987547 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -195,7 +195,5 @@ void __init native_init_IRQ(void) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); -#ifdef CONFIG_X86_32 irq_ctx_init(smp_processor_id()); -#endif } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index cb9c1ed1d391..f73f475d0573 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -132,10 +132,8 @@ int sched_set_itmt_support(void) sysctl_sched_itmt_enabled = 1; - if (sysctl_sched_itmt_enabled) { - x86_topology_update = true; - rebuild_sched_domains(); - } + x86_topology_update = true; + rebuild_sched_domains(); mutex_unlock(&itmt_update_mutex); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index fc25f698d792..ab4f491da2a9 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -32,8 +32,7 @@ static void bug_at(unsigned char *ip, int line) * Something went wrong. Crash the box, as something could be * corrupting the kernel. */ - pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n", - ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line); + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line); BUG(); } @@ -106,11 +105,9 @@ static void __jump_label_transform(struct jump_entry *entry, void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); - put_online_cpus(); } static enum { diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index bdb83e431d89..38b64587b31b 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -167,7 +167,7 @@ static int __init boot_params_kdebugfs_init(void) struct dentry *dbp, *version, *data; int error = -ENOMEM; - dbp = debugfs_create_dir("boot_params", NULL); + dbp = debugfs_create_dir("boot_params", arch_debugfs_dir); if (!dbp) return -ENOMEM; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index d0a814a9d96a..fb095ba0c02f 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -25,6 +25,7 @@ #include <asm/setup.h> #include <asm/crash.h> #include <asm/efi.h> +#include <asm/e820/api.h> #include <asm/kexec-bzimage64.h> #define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ @@ -99,15 +100,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_saved->nr_map; + nr_e820_entries = e820_table_kexec->nr_entries; - /* TODO: Pass entries more than E820MAX in bootparams setup data */ - if (nr_e820_entries > E820MAX) - nr_e820_entries = E820MAX; + /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ + if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) + nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_map, &e820_saved->map, - nr_e820_entries * sizeof(struct e820entry)); + memcpy(¶ms->e820_table, &e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry)); return 0; } @@ -232,10 +232,10 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, nr_e820_entries = params->e820_entries; for (i = 0; i < nr_e820_entries; i++) { - if (params->e820_map[i].type != E820_RAM) + if (params->e820_table[i].type != E820_TYPE_RAM) continue; - start = params->e820_map[i].addr; - end = params->e820_map[i].addr + params->e820_map[i].size - 1; + start = params->e820_table[i].addr; + end = params->e820_table[i].addr + params->e820_table[i].size - 1; if ((start <= 0x100000) && end > 0x100000) { mem_k = (end >> 10) - (0x100000 >> 10); diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index c6ee63f927ab..db2182d63ed0 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -67,7 +67,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(kprobe_opcode_t *instruction); +extern int can_boost(struct insn *insn, void *orig_addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); @@ -75,7 +75,7 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, * Copy an instruction and adjust the displacement if the instruction * uses the %rip-relative addressing mode. */ -extern int __copy_instruction(u8 *dest, u8 *src); +extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn); /* Generate a relative-jump/call instruction */ extern void synthesize_reljump(void *from, void *to); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index eb3509338ae0..6b877807598b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,12 +45,14 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/preempt.h> +#include <linux/sched/debug.h> #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> #include <linux/frame.h> #include <linux/kasan.h> +#include <linux/moduleloader.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -60,6 +62,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" @@ -163,42 +166,38 @@ static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) NOKPROBE_SYMBOL(skip_prefixes); /* - * Returns non-zero if opcode is boostable. + * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(kprobe_opcode_t *opcodes) +int can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; - kprobe_opcode_t *orig_opcodes = opcodes; - if (search_exception_tables((unsigned long)opcodes)) + if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ -retry: - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - opcode = *(opcodes++); - /* 2nd-byte opcode */ - if (opcode == 0x0f) { - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - return test_bit(*opcodes, + if (insn->opcode.nbytes == 2) + return test_bit(insn->opcode.bytes[1], (unsigned long *)twobyte_is_boostable); - } + + if (insn->opcode.nbytes != 1) + return 0; + + /* Can't boost Address-size override prefix */ + if (unlikely(inat_is_address_size_prefix(insn->attr))) + return 0; + + opcode = insn->opcode.bytes[0]; switch (opcode & 0xf0) { -#ifdef CONFIG_X86_64 - case 0x40: - goto retry; /* REX prefix is boostable */ -#endif case 0x60: - if (0x63 < opcode && opcode < 0x67) - goto retry; /* prefixes */ - /* can't boost Address-size override and bound */ - return (opcode != 0x62 && opcode != 0x67); + /* can't boost "bound" */ + return (opcode != 0x62); case 0x70: return 0; /* can't boost conditional jump */ + case 0x90: + return opcode != 0x9a; /* can't boost call far */ case 0xc0: /* can't boost software-interruptions */ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; @@ -209,14 +208,9 @@ retry: /* can boost in/out and absolute jmps */ return ((opcode & 0x04) || opcode == 0xea); case 0xf0: - if ((opcode & 0x0c) == 0 && opcode != 0xf1) - goto retry; /* lock/rep(ne) prefix */ /* clear and set flags are boostable */ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); default: - /* segment override prefixes are boostable */ - if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) - goto retry; /* prefixes */ /* CS override prefix and call are not boostable */ return (opcode != 0x2e && opcode != 0x9a); } @@ -263,7 +257,10 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Fortunately, we know that the original code is the ideal 5-byte * long NOP. */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (faddr) memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); else @@ -275,7 +272,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Recover the probed instruction at addr for further analysis. * Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. - * Returns zero if the instruction can not get recovered. + * Returns zero if the instruction can not get recovered (or access failed). */ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { @@ -347,37 +344,36 @@ static int is_IF_modifier(kprobe_opcode_t *insn) } /* - * Copy an instruction and adjust the displacement if the instruction - * uses the %rip-relative addressing mode. - * If it does, Return the address of the 32-bit displacement word. - * If not, return null. - * Only applicable to 64-bit x86. + * Copy an instruction with recovering modified instruction by kprobes + * and adjust the displacement if the instruction uses the %rip-relative + * addressing mode. + * This returns the length of copied instruction, or 0 if it has an error. */ -int __copy_instruction(u8 *dest, u8 *src) +int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) { - struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - int length; unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); - if (!recovered_insn) + if (!recovered_insn || !insn) return 0; - kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); - insn_get_length(&insn); - length = insn.length; + + /* This can access kernel text if given address is not recovered */ + if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE)) + return 0; + + kernel_insn_init(insn, dest, MAX_INSN_SIZE); + insn_get_length(insn); /* Another subsystem puts a breakpoint, failed to recover */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; - memcpy(dest, insn.kaddr, length); #ifdef CONFIG_X86_64 - if (insn_rip_relative(&insn)) { + /* Only x86_64 has RIP relative instructions */ + if (insn_rip_relative(insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest, length); - insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing * mode. Adjust the displacement for the difference between @@ -390,36 +386,65 @@ int __copy_instruction(u8 *dest, u8 *src) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; + newdisp = (u8 *) src + (s64) insn->displacement.value + - (u8 *) dest; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value); + pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", + src, dest, insn->displacement.value); return 0; } - disp = (u8 *) dest + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(insn); *(s32 *) disp = (s32) newdisp; } #endif - return length; + return insn->length; +} + +/* Prepare reljump right after instruction to boost */ +static void prepare_boost(struct kprobe *p, struct insn *insn) +{ + if (can_boost(insn, p->addr) && + MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + synthesize_reljump(p->ainsn.insn + insn->length, + p->addr + insn->length); + p->ainsn.boostable = true; + } else { + p->ainsn.boostable = false; + } +} + +/* Recover page to RW mode before releasing it */ +void free_insn_page(void *page) +{ + set_memory_nx((unsigned long)page & PAGE_MASK, 1); + set_memory_rw((unsigned long)page & PAGE_MASK, 1); + module_memfree(page); } static int arch_copy_kprobe(struct kprobe *p) { - int ret; + struct insn insn; + int len; + + set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Copy an instruction with recovering if other optprobe modifies it.*/ - ret = __copy_instruction(p->ainsn.insn, p->addr); - if (!ret) + len = __copy_instruction(p->ainsn.insn, p->addr, &insn); + if (!len) return -EINVAL; /* * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. */ - if (can_boost(p->ainsn.insn)) - p->ainsn.boostable = 0; - else - p->ainsn.boostable = -1; + prepare_boost(p, &insn); + + set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Check whether the instruction modifies Interrupt Flag or not */ p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); @@ -458,7 +483,7 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } } @@ -530,7 +555,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, return; #if !defined(CONFIG_PREEMPT) - if (p->ainsn.boostable == 1 && !p->post_handler) { + if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) reset_current_kprobe(); @@ -745,7 +770,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) * will be the real return address, and all the rest will * point to kretprobe_trampoline. */ - hlist_for_each_entry_safe(ri, tmp, head, hlist) { + hlist_for_each_entry(ri, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; @@ -850,7 +875,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, case 0xcf: case 0xea: /* jmp absolute -- ip is correct */ /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; case 0xe8: /* call relative - Fix return addr */ *tos = orig_ip + (*tos - copy_ip); @@ -875,28 +900,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, * jmp near and far, absolute indirect * ip is correct. And this is boostable */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; } default: break; } - if (p->ainsn.boostable == 0) { - if ((regs->ip > copy_ip) && - (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { - /* - * These instructions can be executed directly if it - * jumps back to correct address. - */ - synthesize_reljump((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); - p->ainsn.boostable = 1; - } else { - p->ainsn.boostable = -1; - } - } - regs->ip += orig_ip - copy_ip; no_change: diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 5f8f0b3cc674..041f7b6dfa0f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -94,6 +94,6 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); int arch_prepare_kprobe_ftrace(struct kprobe *p) { p->ainsn.insn = NULL; - p->ainsn.boostable = -1; + p->ainsn.boostable = false; return 0; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3d1bee9d6a72..69ea0bc1cfa3 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -28,6 +28,7 @@ #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> +#include <linux/frame.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -37,6 +38,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" @@ -65,7 +67,10 @@ found: * overwritten by jump destination address. In this case, original * bytes must be recovered from op->optinsn.copied_insn buffer. */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (addr == (unsigned long)kp->addr) { buf[0] = kp->opcode; memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); @@ -90,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) } asm ( + "optprobe_template_func:\n" ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -127,7 +133,12 @@ asm ( " popf\n" #endif ".global optprobe_template_end\n" - "optprobe_template_end:\n"); + "optprobe_template_end:\n" + ".type optprobe_template_func, @function\n" + ".size optprobe_template_func, .-optprobe_template_func\n"); + +void optprobe_template_func(void); +STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_MOVE_IDX \ ((long)&optprobe_template_val - (long)&optprobe_template_entry) @@ -174,11 +185,12 @@ NOKPROBE_SYMBOL(optimized_callback); static int copy_optimized_instructions(u8 *dest, u8 *src) { + struct insn insn; int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len)) + ret = __copy_instruction(dest + len, src + len, &insn); + if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; } @@ -350,6 +362,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, } buf = (u8 *)op->optinsn.insn; + set_memory_rw((unsigned long)buf & PAGE_MASK, 1); /* Copy instructions into the out-of-line buffer */ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); @@ -372,6 +385,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, (u8 *)op->kp.addr + op->optinsn.size); + set_memory_ro((unsigned long)buf & PAGE_MASK, 1); + flush_icache_range((unsigned long) buf, (unsigned long) buf + TMPL_END_IDX + op->optinsn.size + RELATIVEJUMP_SIZE); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 36bc66416021..71c17a5be983 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -161,8 +161,8 @@ void kvm_async_pf_task_wait(u32 token) */ rcu_irq_exit(); native_safe_halt(); - rcu_irq_enter(); local_irq_disable(); + rcu_irq_enter(); } } if (!n.halted) @@ -330,7 +330,12 @@ static void kvm_guest_cpu_init(void) #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); + pa |= KVM_ASYNC_PF_ENABLED; + + /* Async page fault support for L1 hypervisor is optional */ + if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, + (pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0) + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); @@ -396,9 +401,9 @@ static u64 kvm_steal_clock(int cpu) src = &per_cpu(steal_time, cpu); do { version = src->version; - rmb(); + virt_rmb(); steal = src->steal; - rmb(); + virt_rmb(); } while ((version & 1) || (version != src->version)); return steal; @@ -589,7 +594,8 @@ out: local_irq_restore(flags); } -__visible bool __kvm_vcpu_is_preempted(int cpu) +#ifdef CONFIG_X86_32 +__visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); @@ -597,6 +603,29 @@ __visible bool __kvm_vcpu_is_preempted(int cpu) } PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); +#else + +#include <asm/asm-offsets.h> + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +"ret;" +".popsection"); + +#endif + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -620,18 +649,4 @@ void __init kvm_spinlock_init(void) } } -static __init int kvm_spinlock_init_jump(void) -{ - if (!kvm_para_available()) - return 0; - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - printk(KERN_INFO "KVM setup paravirtual spinlock\n"); - - return 0; -} -early_initcall(kvm_spinlock_init_jump); - #endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..d88967659098 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -25,9 +25,11 @@ #include <linux/hardirq.h> #include <linux/memblock.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <asm/x86_init.h> #include <asm/reboot.h> +#include <asm/kvmclock.h> static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; @@ -49,6 +51,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { return hv_clock; } +EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); /* * The wallclock is the time of day when we booted. Since then, some time may @@ -107,12 +110,12 @@ static inline void kvm_sched_clock_init(bool stable) { if (!stable) { pv_time_ops.sched_clock = kvm_clock_read; + clear_sched_clock_stable(); return; } kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - set_sched_clock_stable(); printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", kvm_sched_clock_offset); @@ -174,13 +177,14 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -static struct clocksource kvm_clock = { +struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +EXPORT_SYMBOL_GPL(kvm_clock); int kvm_register_clock(char *txt) { diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index d4a15831ac58..a870910c8565 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -22,24 +22,25 @@ #include <asm/syscalls.h> /* context.lock is held for us, so we don't need any locking. */ -static void flush_ldt(void *current_mm) +static void flush_ldt(void *__mm) { + struct mm_struct *mm = __mm; mm_context_t *pc; - if (current->active_mm != current_mm) + if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = ¤t->active_mm->context; - set_ldt(pc->ldt->entries, pc->ldt->size); + pc = &mm->context; + set_ldt(pc->ldt->entries, pc->ldt->nr_entries); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ -static struct ldt_struct *alloc_ldt_struct(unsigned int size) +static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) { struct ldt_struct *new_ldt; unsigned int alloc_size; - if (size > LDT_ENTRIES) + if (num_entries > LDT_ENTRIES) return NULL; new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); @@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) return NULL; BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); - alloc_size = size * LDT_ENTRY_SIZE; + alloc_size = num_entries * LDT_ENTRY_SIZE; /* * Xen is very picky: it requires a page-aligned LDT that has no @@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) return NULL; } - new_ldt->size = size; + new_ldt->nr_entries = num_entries; return new_ldt; } /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { - paravirt_alloc_ldt(ldt->entries, ldt->size); + paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } /* context.lock is held */ @@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; - paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + paravirt_free_ldt(ldt->entries, ldt->nr_entries); + if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); @@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) goto out_unlock; } - new_ldt = alloc_ldt_struct(old_mm->context.ldt->size); + new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { retval = -ENOMEM; goto out_unlock; } memcpy(new_ldt->entries, old_mm->context.ldt->entries, - new_ldt->size * LDT_ENTRY_SIZE); + new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); mm->context.ldt = new_ldt; @@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm) static int read_ldt(void __user *ptr, unsigned long bytecount) { - int retval; - unsigned long size; struct mm_struct *mm = current->mm; + unsigned long entries_size; + int retval; mutex_lock(&mm->context.lock); @@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; - size = mm->context.ldt->size * LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; + entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; + if (entries_size > bytecount) + entries_size = bytecount; - if (copy_to_user(ptr, mm->context.ldt->entries, size)) { + if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { retval = -EFAULT; goto out_unlock; } - if (size != bytecount) { + if (entries_size != bytecount) { /* Zero-fill the rest and pretend we read bytecount bytes. */ - if (clear_user(ptr + size, bytecount - size)) { + if (clear_user(ptr + entries_size, bytecount - entries_size)) { retval = -EFAULT; goto out_unlock; } @@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; struct ldt_struct *new_ldt, *old_ldt; - unsigned int oldsize, newsize; + unsigned int old_nr_entries, new_nr_entries; struct user_desc ldt_info; struct desc_struct ldt; int error; @@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) mutex_lock(&mm->context.lock); - old_ldt = mm->context.ldt; - oldsize = old_ldt ? old_ldt->size : 0; - newsize = max(ldt_info.entry_number + 1, oldsize); + old_ldt = mm->context.ldt; + old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; + new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); error = -ENOMEM; - new_ldt = alloc_ldt_struct(newsize); + new_ldt = alloc_ldt_struct(new_nr_entries); if (!new_ldt) goto out_unlock; if (old_ldt) - memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE); + memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); + new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 469b23d6acc2..8c53c5d7a1bc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,7 @@ #include <asm/io_apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) @@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one( pgd_t *pgd, pmd_t *pmd, pte_t *pte, unsigned long vaddr, unsigned long paddr) { + p4d_t *p4d; pud_t *pud; pgd += pgd_index(vaddr); @@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one( if (!(pgd_val(*pgd) & _PAGE_PRESENT)) set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); #endif - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); if (!(pmd_val(*pmd) & _PAGE_PRESENT)) set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 307b1f4543de..cb0a30473c23 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -27,6 +27,7 @@ #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> #include <asm/setup.h> +#include <asm/set_memory.h> #ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { @@ -36,6 +37,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = { static void free_transition_pgtable(struct kimage *image) { + free_page((unsigned long)image->arch.p4d); free_page((unsigned long)image->arch.pud); free_page((unsigned long)image->arch.pmd); free_page((unsigned long)image->arch.pte); @@ -43,6 +45,7 @@ static void free_transition_pgtable(struct kimage *image) static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -53,13 +56,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + goto err; + image->arch.p4d = p4d; + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + p4d = p4d_offset(pgd, vaddr); + if (!p4d_present(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) goto err; image->arch.pud = pud; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } - pud = pud_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); if (!pud_present(*pud)) { pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) @@ -103,7 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .context = image, - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -112,6 +123,10 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); + + if (direct_gbpages) + info.direct_gbpages = true; + for (i = 0; i < nr_pfn_mapped; i++) { mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; @@ -194,19 +209,22 @@ static int arch_update_purgatory(struct kimage *image) /* Setup copying of backup region */ if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_dest", &image->arch.backup_load_addr, sizeof(image->arch.backup_load_addr), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_src", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_src", &image->arch.backup_src_start, sizeof(image->arch.backup_src_start), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_sz", &image->arch.backup_src_sz, sizeof(image->arch.backup_src_sz), 0); if (ret) @@ -329,7 +347,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(phys_base); - VMCOREINFO_SYMBOL(init_level4_pgt); + VMCOREINFO_SYMBOL(init_top_pgt); #ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 477ae806c2fa..f67bd3205df7 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -85,7 +85,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0f8d20497383..0d904d759ff1 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -26,7 +26,7 @@ #include <asm/io_apic.h> #include <asm/proto.h> #include <asm/bios_ebda.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/setup.h> #include <asm/smp.h> @@ -826,10 +826,10 @@ static int __init parse_alloc_mptable_opt(char *p) } early_param("alloc_mptable", parse_alloc_mptable_opt); -void __init early_reserve_e820_mpc_new(void) +void __init e820__memblock_alloc_reserved_mpc_new(void) { if (enable_update_mptable && alloc_mptable) - mpc_new_phys = early_reserve_e820(mpc_new_length, 4); + mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4); } static int __init update_mp_table(void) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index bfe4d6c96fbd..446c8aa09b9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -13,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/kdebug.h> +#include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/debugfs.h> #include <linux/delay.h> @@ -20,6 +21,7 @@ #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/sched/clock.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> @@ -164,11 +166,9 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) spin_lock_irqsave(&desc->lock, flags); /* - * most handlers of type NMI_UNKNOWN never return because - * they just assume the NMI is theirs. Just a sanity check - * to manage expectations + * Indicate if there are multiple registrations on the + * internal NMI handler call chains (SERR and IO_CHECK). */ - WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); @@ -222,17 +222,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - if (panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 6d9582ec0324..d27f8d84c4ff 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -78,7 +78,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) /* Don't wait longer than a second */ timeout = USEC_PER_SEC; - while (!cpumask_empty(mask) && timeout--) + while (!cpumask_empty(mask) && --timeout) udelay(1); /* What happens if we timeout, do we still unregister?? */ diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 6d4bf812af45..8f2d1c9d43a8 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -20,7 +20,7 @@ bool pv_is_native_spin_unlock(void) __raw_callee_save___native_queued_spin_unlock; } -__visible bool __native_vcpu_is_preempted(int cpu) +__visible bool __native_vcpu_is_preempted(long cpu) { return false; } @@ -42,6 +42,3 @@ struct pv_lock_ops pv_lock_ops = { #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); - -struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index a1bfba0f7234..bc0a849589bb 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, - .read_cr3 = native_read_cr3, + .read_cr3 = __native_read_cr3, .write_cr3 = native_write_cr3, .flush_tlb_user = native_flush_tlb, @@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, .alloc_pud = paravirt_nop, + .alloc_p4d = paravirt_nop, .release_pte = paravirt_nop, .release_pmd = paravirt_nop, .release_pud = paravirt_nop, + .release_p4d = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, @@ -425,16 +427,24 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, + .set_pud_at = native_set_pud_at, .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 .pud_val = PTE_IDENT, .make_pud = PTE_IDENT, + .set_p4d = native_set_p4d, + +#if CONFIG_PGTABLE_LEVELS >= 5 + .p4d_val = PTE_IDENT, + .make_p4d = PTE_IDENT, + .set_pgd = native_set_pgd, -#endif +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .pte_val = PTE_IDENT, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5d400ba1349d..5286a4a92cf7 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -50,6 +50,8 @@ #include <asm/x86_init.h> #include <asm/iommu_table.h> +#define CALGARY_MAPPING_ERROR 0 + #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; #else @@ -252,7 +254,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return DMA_ERROR_CODE; + return CALGARY_MAPPING_ERROR; } } @@ -272,10 +274,10 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_ERROR_CODE)) { + if (unlikely(entry == CALGARY_MAPPING_ERROR)) { pr_warn("failed to allocate %u pages in iommu %p\n", npages, tbl); - return DMA_ERROR_CODE; + return CALGARY_MAPPING_ERROR; } /* set the return dma address */ @@ -295,8 +297,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { + badend = CALGARY_MAPPING_ERROR + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely(dma_addr < badend)) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; @@ -380,7 +382,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == DMA_ERROR_CODE) { + if (entry == CALGARY_MAPPING_ERROR) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -398,7 +400,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, 0); for_each_sg(sg, s, nelems, i) { - sg->dma_address = DMA_ERROR_CODE; + sg->dma_address = CALGARY_MAPPING_ERROR; sg->dma_length = 0; } return 0; @@ -453,7 +455,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == DMA_ERROR_CODE) + if (mapping == CALGARY_MAPPING_ERROR) goto free; *dma_handle = mapping; return ret; @@ -478,13 +480,20 @@ static void calgary_free_coherent(struct device *dev, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } -static struct dma_map_ops calgary_dma_ops = { +static int calgary_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == CALGARY_MAPPING_ERROR; +} + +static const struct dma_map_ops calgary_dma_ops = { .alloc = calgary_alloc_coherent, .free = calgary_free_coherent, .map_sg = calgary_map_sg, .unmap_sg = calgary_unmap_sg, .map_page = calgary_map_page, .unmap_page = calgary_unmap_page, + .mapping_error = calgary_mapping_error, + .dma_supported = x86_dma_supported, }; static inline void __iomem * busno_to_bbar(unsigned char num) @@ -732,7 +741,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); + iommu_range_reserve(tbl, CALGARY_MAPPING_ERROR, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ @@ -1007,9 +1016,8 @@ static void __init calgary_enable_translation(struct pci_dev *dev) writel(cpu_to_be32(val32), target); readl(target); /* flush */ - init_timer(&tbl->watchdog_timer); - tbl->watchdog_timer.function = &calgary_watchdog; - tbl->watchdog_timer.data = (unsigned long)dev; + setup_timer(&tbl->watchdog_timer, &calgary_watchdog, + (unsigned long)dev); mod_timer(&tbl->watchdog_timer, jiffies); } @@ -1177,7 +1185,7 @@ static int __init calgary_init(void) tbl = find_iommu_table(&dev->dev); if (translation_enabled(tbl)) - dev->dev.archdata.dma_ops = &calgary_dma_ops; + dev->dev.dma_ops = &calgary_dma_ops; } return ret; @@ -1201,7 +1209,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ - dev->dev.archdata.dma_ops = NULL; + dev->dev.dma_ops = NULL; } while (1); return ret; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d30c37750765..5e16d3f29594 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -17,7 +17,7 @@ static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops = &nommu_dma_ops; +const struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -91,7 +91,8 @@ again: page = NULL; /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(flag)) { - page = dma_alloc_from_contiguous(dev, count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + flag); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); page = NULL; @@ -212,10 +213,8 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -int dma_supported(struct device *dev, u64 mask) +int x86_dma_supported(struct device *dev, u64 mask) { - struct dma_map_ops *ops = get_dma_ops(dev); - #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { dev_info(dev, "PCI: Disallowing DAC for device\n"); @@ -223,9 +222,6 @@ int dma_supported(struct device *dev, u64 mask) } #endif - if (ops->dma_supported) - return ops->dma_supported(dev, mask); - /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. The caller just has to use GFP_DMA in this case. */ @@ -251,7 +247,6 @@ int dma_supported(struct device *dev, u64 mask) return 1; } -EXPORT_SYMBOL(dma_supported); static int __init pci_iommu_init(void) { diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 00e71ce396a8..a6d404087fe3 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -11,6 +11,8 @@ #include <asm/iommu.h> #include <asm/dma.h> +#define NOMMU_MAPPING_ERROR 0 + static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { @@ -33,7 +35,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return DMA_ERROR_CODE; + return NOMMU_MAPPING_ERROR; flush_write_buffers(); return bus; } @@ -88,7 +90,12 @@ static void nommu_sync_sg_for_device(struct device *dev, flush_write_buffers(); } -struct dma_map_ops nommu_dma_ops = { +static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == NOMMU_MAPPING_ERROR; +} + +const struct dma_map_ops nommu_dma_ops = { .alloc = dma_generic_alloc_coherent, .free = dma_generic_free_coherent, .map_sg = nommu_map_sg, @@ -96,4 +103,6 @@ struct dma_map_ops nommu_dma_ops = { .sync_single_for_device = nommu_sync_single_for_device, .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, + .mapping_error = nommu_mapping_error, + .dma_supported = x86_dma_supported, }; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 410efb2c7b80..1e23577e17cf 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -45,7 +45,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size, dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } -static struct dma_map_ops swiotlb_dma_ops = { +static const struct dma_map_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc = x86_swiotlb_alloc_coherent, .free = x86_swiotlb_free_coherent, diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index da8cb987b973..587d887f7f17 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -1,6 +1,7 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/perf_event.h> #include <linux/bug.h> #include <linux/stddef.h> diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index d5f15c3f7b25..963e3fb56437 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -14,7 +14,7 @@ #include <asm/probe_roms.h> #include <asm/pci-direct.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mmzone.h> #include <asm/setup.h> #include <asm/sections.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b615a1113f58..3ca198080ea9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,10 @@ #include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/idle.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/init.h> #include <linux/export.h> #include <linux/pm.h> @@ -32,6 +36,8 @@ #include <asm/mce.h> #include <asm/vm86.h> #include <asm/switch_to.h> +#include <asm/desc.h> +#include <asm/prctl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -64,6 +70,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); +DEFINE_PER_CPU(bool, __tss_limit_invalid); +EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); + /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. @@ -116,11 +125,6 @@ void flush_thread(void) fpu__clear(&tsk->thread.fpu); } -static void hard_disable_TSC(void) -{ - cr4_set_bits(X86_CR4_TSD); -} - void disable_TSC(void) { preempt_disable(); @@ -129,15 +133,10 @@ void disable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_disable_TSC(); + cr4_set_bits(X86_CR4_TSD); preempt_enable(); } -static void hard_enable_TSC(void) -{ - cr4_clear_bits(X86_CR4_TSD); -} - static void enable_TSC(void) { preempt_disable(); @@ -146,7 +145,7 @@ static void enable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_enable_TSC(); + cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } @@ -174,48 +173,129 @@ int set_tsc_mode(unsigned int val) return 0; } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; +DEFINE_PER_CPU(u64, msr_misc_features_shadow); - if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ - test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); +static void set_cpuid_faulting(bool on) +{ + u64 msrval; - debugctl &= ~DEBUGCTLMSR_BTF; - if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) - debugctl |= DEBUGCTLMSR_BTF; + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} - update_debugctlmsr(debugctl); +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); } + preempt_enable(); +} - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +{ + if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. */ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); +} + +static inline void switch_to_bitmap(struct tss_struct *tss, + struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) +{ + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. * Normally this is 128 bytes or less: */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + /* + * Make sure that the TSS limit is correct for the CPU + * to notice the IO bitmap. + */ + refresh_tss_limit(); + } else if (tifp & _TIF_IO_BITMAP) { /* * Clear any possible leftover bits: */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long tifp, tifn; + + prev = &prev_p->thread; + next = &next_p->thread; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + switch_to_bitmap(tss, prev, next, tifp, tifn); + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); } /* @@ -465,17 +545,6 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) } /* - * Return saved PC of a blocked thread. - * What is this good for? it will be always the scheduler or ret_from_fork. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - struct inactive_task_frame *frame = - (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp); - return READ_ONCE_NOCHECK(frame->ret_addr); -} - -/* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully * because the task might wake up and we might look at a stack @@ -536,3 +605,16 @@ out: put_task_stack(p); return ret; } + +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled) +{ + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(task, cpuid_enabled); + } + + return -EINVAL; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a0ac3e81518a..c6d6dc5f8bb2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -12,6 +12,8 @@ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -35,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/ldt.h> @@ -54,6 +57,7 @@ #include <asm/switch_to.h> #include <asm/vm86.h> #include <asm/intel_rdt.h> +#include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) { @@ -74,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, - smp_processor_id()); + raw_smp_processor_id()); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -88,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); - cr3 = read_cr3(); + cr3 = __read_cr3(); cr4 = __read_cr4(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); @@ -302,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a61e141b6891..c3169be4c596 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -17,6 +17,8 @@ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -35,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -50,6 +53,11 @@ #include <asm/xen/hypervisor.h> #include <asm/vdso.h> #include <asm/intel_rdt.h> +#include <asm/unistd.h> +#ifdef CONFIG_IA32_EMULATION +/* Not included via unistd.h */ +#include <asm/unistd_32_ia32.h> +#endif __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -96,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); - cr3 = read_cr3(); + cr3 = __read_cr3(); cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", @@ -134,7 +142,7 @@ void release_thread(struct task_struct *dead_task) pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, dead_task->mm->context.ldt->entries, - dead_task->mm->context.ldt->size); + dead_task->mm->context.ldt->nr_entries); BUG(); } #endif @@ -202,7 +210,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, tls); + err = do_arch_prctl_64(p, ARCH_SET_FS, tls); if (err) goto out; } @@ -438,7 +446,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV /* * On Xen PV, IOPL bits in pt_regs->flags have no effect, and * current_pt_regs()->flags may not match the current task's @@ -491,6 +499,8 @@ void set_personality_64bit(void) clear_thread_flag(TIF_IA32); clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); + /* Pretend that this comes from a 64bit execve */ + task_pt_regs(current)->orig_ax = __NR_execve; /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -503,32 +513,50 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -void set_personality_ia32(bool x32) +static void __set_personality_x32(void) { - /* inherit personality from parent */ +#ifdef CONFIG_X86_X32 + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_X32; + current->personality &= ~READ_IMPLIES_EXEC; + /* + * in_compat_syscall() uses the presence of the x32 syscall bit + * flag to determine compat status. The x86 mmap() code relies on + * the syscall bitness so set x32 syscall bit right here to make + * in_compat_syscall() work during exec(). + * + * Pretend to come from a x32 execve. + */ + task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; + current->thread.status &= ~TS_COMPAT; +#endif +} + +static void __set_personality_ia32(void) +{ +#ifdef CONFIG_IA32_EMULATION + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_IA32; + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + task_pt_regs(current)->orig_ax = __NR_ia32_execve; + current->thread.status |= TS_COMPAT; +#endif +} +void set_personality_ia32(bool x32) +{ /* Make sure to be in 32bit mode */ set_thread_flag(TIF_ADDR32); - /* Mark the associated mm as containing 32-bit tasks. */ - if (x32) { - clear_thread_flag(TIF_IA32); - set_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_X32; - current->personality &= ~READ_IMPLIES_EXEC; - /* in_compat_syscall() uses the presence of the x32 - syscall bit flag to determine compat status */ - current->thread.status &= ~TS_COMPAT; - } else { - set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_IA32; - current->personality |= force_personality32; - /* Prepare the first "return" to user space */ - current->thread.status |= TS_COMPAT; - } + if (x32) + __set_personality_x32(); + else + __set_personality_ia32(); } EXPORT_SYMBOL_GPL(set_personality_ia32); @@ -545,70 +573,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) } #endif -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (option) { case ARCH_SET_GS: - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.gsindex = 0; - task->thread.gsbase = addr; + task->thread.gsbase = arg2; if (doit) { load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); } put_cpu(); break; case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.fsindex = 0; - task->thread.fsbase = addr; + task->thread.fsbase = arg2; if (doit) { /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, arg2); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; + if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base; + if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: - return prctl_map_vdso(&vdso_image_x32, addr); + return prctl_map_vdso(&vdso_image_x32, arg2); # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: - return prctl_map_vdso(&vdso_image_32, addr); + return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: - return prctl_map_vdso(&vdso_image_64, addr); + return prctl_map_vdso(&vdso_image_64, arg2); #endif default: @@ -619,10 +649,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl(current, code, addr); + long ret; + + ret = do_arch_prctl_64(current, option, arg2); + if (ret == -EINVAL) + ret = do_arch_prctl_common(current, option, arg2); + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); } +#endif unsigned long KSTK_ESP(struct task_struct *task) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 9cc7d5a330ef..f37d18124648 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> @@ -395,12 +396,12 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl + * When changing the segment base, use do_arch_prctl_64 * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ if (child->thread.fsbase != value) - return do_arch_prctl(child, ARCH_SET_FS, value); + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -409,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl(child, ARCH_SET_GS, value); + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } @@ -868,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request, Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); + ret = do_arch_prctl_64(child, data, addr); break; #endif diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 9e93fe5803b4..5c3f6d6a5078 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -21,6 +21,8 @@ #include <linux/sched.h> #include <linux/gfp.h> #include <linux/bootmem.h> +#include <linux/nmi.h> + #include <asm/fixmap.h> #include <asm/pvclock.h> diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e244c19a2451..67393fc88353 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -9,6 +9,7 @@ #include <linux/sched.h> #include <linux/tboot.h> #include <linux/delay.h> +#include <linux/frame.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type) #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif +STACK_FRAME_NON_STANDARD(machine_real_restart); /* * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot @@ -223,6 +225,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, + { /* Handle problems with rebooting on ASUS EeeBook X205TA */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TAW */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TAW", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"), + }, + }, /* Certec */ { /* Handle problems with rebooting on Certec BPC600 */ @@ -749,10 +767,11 @@ void machine_crash_shutdown(struct pt_regs *regs) #endif +/* This is the CPU performing the emergency shutdown work. */ +int crashing_cpu = -1; + #if defined(CONFIG_SMP) -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 2408c1603438..5ab3895516ac 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,5 +1,5 @@ #include <linux/ioport.h> -#include <asm/e820.h> +#include <asm/e820/api.h> static void resource_clip(struct resource *res, resource_size_t start, resource_size_t end) @@ -25,10 +25,10 @@ static void resource_clip(struct resource *res, resource_size_t start, static void remove_e820_regions(struct resource *avail) { int i; - struct e820entry *entry; + struct e820_entry *entry; - for (i = 0; i < e820->nr_map; i++) { - entry = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + entry = &e820_table->entries[i]; resource_clip(avail, entry->addr, entry->addr + entry->size - 1); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4cfba947d774..3486d0498800 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -70,12 +70,13 @@ #include <linux/tboot.h> #include <linux/jiffies.h> +#include <linux/usb/xhci-dbgp.h> #include <video/edid.h> #include <asm/mtrr.h> #include <asm/apic.h> #include <asm/realmode.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/efi.h> @@ -119,7 +120,7 @@ * max_low_pfn_mapped: highest direct mapped pfn under 4GB * max_pfn_mapped: highest direct mapped pfn over 4GB * - * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; @@ -173,14 +174,11 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -/* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data = { - .wp_works_ok = -1, -}; +/* cpu data as detected by the assembly code in head_32.S */ +struct cpuinfo_x86 new_cpu_data; + /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .wp_works_ok = -1, -}; +struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; @@ -426,7 +424,7 @@ static void __init parse_setup_data(void) switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(pa_data, data_len); + e820__memory_setup_extended(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); @@ -441,29 +439,6 @@ static void __init parse_setup_data(void) } } -static void __init e820_reserve_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - - pa_data = boot_params.hdr.setup_data; - if (!pa_data) - return; - - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - e820_update_range(pa_data, sizeof(*data)+data->len, - E820_RAM, E820_RESERVED_KERN); - pa_data = data->next; - early_memunmap(data, sizeof(*data)); - } - - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); - memcpy(e820_saved, e820, sizeof(struct e820map)); - printk(KERN_INFO "extended physical RAM map:\n"); - e820_print_map("reserve setup_data"); -} - static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; @@ -528,7 +503,7 @@ static int __init reserve_crashkernel_low(void) return 0; } - low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN); + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); if (!low_base) { pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", (unsigned long)(low_size >> 20)); @@ -575,7 +550,9 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { /* - * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX + * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, + * as old kexec-tools loads bzImage below that, unless + * "crashkernel=size[KMG],high" is specified. */ crash_base = memblock_find_in_range(CRASH_ALIGN, high ? CRASH_ADDR_HIGH_MAX @@ -754,16 +731,16 @@ static void __init trim_bios_range(void) * since some BIOSes are known to corrupt low memory. See the * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. * take them out. */ - e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__update_table(e820_table); } /* called before trim_bios_range() to spare extra sanitize */ @@ -773,18 +750,18 @@ static void __init e820_add_kernel_range(void) u64 size = __pa_symbol(_end) - start; /* - * Complain if .text .data and .bss are not marked as E820_RAM and + * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and * attempt to fix it by adding the range. We may have a confused BIOS, * or the user may have used memmap=exactmap or memmap=xxM$yyM to * exclude kernel range. If we really are running on top non-RAM, * we will crash later anyways. */ - if (e820_all_mapped(start, start + size, E820_RAM)) + if (e820__mapped_all(start, start + size, E820_TYPE_RAM)) return; - pr_warn(".text .data .bss are not marked as E820_RAM!\n"); - e820_remove_range(start, size, E820_RAM, 0); - e820_add_region(start, size, E820_RAM); + pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); + e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_add(start, size, E820_TYPE_RAM); } static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; @@ -835,6 +812,26 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +static void __init simple_udelay_calibration(void) +{ + unsigned int tsc_khz, cpu_khz; + unsigned long lpj; + + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + + tsc_khz = tsc_khz ? : cpu_khz; + if (!tsc_khz) + return; + + lpj = tsc_khz * 1000; + do_div(lpj, HZ); + loops_per_jiffy = lpj; +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -937,7 +934,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; - setup_memory_map(); + e820__memory_setup(); parse_setup_data(); copy_edd(); @@ -1026,9 +1023,8 @@ void __init setup_arch(char **cmdline_p) early_dump_pci_devices(); #endif - /* update the e820_saved too */ - e820_reserve_setup_data(); - finish_e820_parsing(); + e820__reserve_setup_data(); + e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) efi_init(); @@ -1043,6 +1039,8 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + simple_udelay_calibration(); + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1054,11 +1052,11 @@ void __init setup_arch(char **cmdline_p) trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { - e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, - E820_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, + E820_TYPE_RESERVED); + e820__update_table(e820_table); printk(KERN_INFO "fixed physical RAM map:\n"); - e820_print_map("bad_ppro"); + e820__print_table("bad_ppro"); } #else early_gart_iommu_check(); @@ -1068,16 +1066,23 @@ void __init setup_arch(char **cmdline_p) * partially used pages are not usable - thus * we are rounding upwards: */ - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); max_possible_pfn = max_pfn; /* + * This call is required when the CPU does not support PAT. If + * mtrr_bp_init() invoked it already via pat_init() the call has no + * effect. + */ + init_cache_modes(); + + /* * Define random base addresses for memory sections after max_pfn is * defined and before each memory section base is used. */ @@ -1092,7 +1097,7 @@ void __init setup_arch(char **cmdline_p) /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) - max_low_pfn = e820_end_of_low_ram_pfn(); + max_low_pfn = e820__end_of_low_ram_pfn(); else max_low_pfn = max_pfn; @@ -1109,7 +1114,7 @@ void __init setup_arch(char **cmdline_p) early_alloc_pgt_buf(); /* - * Need to conclude brk, before memblock_x86_fill() + * Need to conclude brk, before e820__memblock_setup() * it could use memblock_find_in_range, could overlap with * brk area. */ @@ -1118,7 +1123,10 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); memblock_set_current_limit(ISA_END_ADDRESS); - memblock_x86_fill(); + e820__memblock_setup(); + + if (!early_xdbc_setup_hardware()) + early_xdbc_register_console(); reserve_bios_regions(); @@ -1135,7 +1143,7 @@ void __init setup_arch(char **cmdline_p) } /* preallocate 4k for mptable mpc */ - early_reserve_e820_mpc_new(); + e820__memblock_alloc_reserved_mpc_new(); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); @@ -1176,6 +1184,20 @@ void __init setup_arch(char **cmdline_p) /* Allocate bigger log buffer */ setup_log_buf(1); + if (efi_enabled(EFI_BOOT)) { + switch (boot_params.secure_boot) { + case efi_secureboot_mode_disabled: + pr_info("Secure boot disabled\n"); + break; + case efi_secureboot_mode_enabled: + pr_info("Secure boot enabled\n"); + break; + default: + pr_info("Secure boot could not be determined\n"); + break; + } + } + reserve_initrd(); acpi_table_upgrade(); @@ -1259,12 +1281,12 @@ void __init setup_arch(char **cmdline_p) kvm_guest_init(); - e820_reserve_resources(); - e820_mark_nosave_regions(max_low_pfn); + e820__reserve_resources(); + e820__register_nosave_regions(max_low_pfn); x86_init.resources.reserve_resources(); - e820_setup_gap(); + e820__setup_pci_gap(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9820d6d977c6..10edd1e69a68 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu) pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, 0x2 | DESCTYPE_S, 0x8); gdt.s = 1; - write_gdt_entry(get_cpu_gdt_table(cpu), + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); #endif } @@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); + +#ifdef CONFIG_X86_32 + /* + * Sync back kernel address range again. We already did this in + * setup_arch(), but percpu data also needs to be available in + * the smpboot asm. We can't reliably pick up percpu mappings + * using vmalloc_fault(), because exception dispatch needs + * percpu data. + */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. + */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +#endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 763af1d0de64..cc30a74e4adb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> @@ -845,7 +846,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ec1f756f9dc9..71beb28600d4 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, if (from->si_signo == SIGSEGV) { if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)&to->si_lower; - compat_uptr_t upper = (unsigned long)&to->si_upper; + compat_uptr_t lower = (unsigned long)from->si_lower; + compat_uptr_t upper = (unsigned long)from->si_upper; put_user_ex(lower, &to->si_lower); put_user_ex(upper, &to->si_upper); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 68f8cc222f25..d798c0da451c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -33,6 +33,7 @@ #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> +#include <asm/virtext.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -124,7 +125,7 @@ static bool smp_no_nmi_ipi = false; static void native_smp_send_reschedule(int cpu) { if (unlikely(cpu_is_offline(cpu))) { - WARN_ON(1); + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } apic->send_IPI(cpu, RESCHEDULE_VECTOR); @@ -162,6 +163,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; + cpu_emergency_vmxoff(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -174,6 +176,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { ipi_entering_ack_irq(); + cpu_emergency_vmxoff(); stop_this_cpu(NULL); irq_exit(); } @@ -259,7 +262,7 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -__visible void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __smp_reschedule_interrupt(); @@ -268,7 +271,7 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -292,14 +295,15 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -314,14 +318,16 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 46732dc3b73c..b474c8de7fba 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -45,6 +45,9 @@ #include <linux/smp.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/topology.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/task_stack.h> #include <linux/percpu.h> #include <linux/bootmem.h> #include <linux/err.h> @@ -433,9 +436,15 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && - c->cpu_core_id == o->cpu_core_id) - return topology_sane(c, o, "smt"); + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { + if (c->cpu_core_id == o->cpu_core_id) + return topology_sane(c, o, "smt"); + + if ((c->cu_id != 0xff) && + (o->cu_id != 0xff) && + (c->cu_id == o->cu_id)) + return topology_sane(c, o, "smt"); + } } else if (c->phys_proc_id == o->phys_proc_id && c->cpu_core_id == o->cpu_core_id) { @@ -854,7 +863,7 @@ static void announce_cpu(int cpu, int apicid) if (cpu == 1) printk(KERN_INFO "x86: Booting SMP configuration:\n"); - if (system_state == SYSTEM_BOOTING) { + if (system_state < SYSTEM_RUNNING) { if (node != current_node) { if (current_node > (-1)) pr_cont("\n"); @@ -974,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long timeout; idle->thread.sp = (unsigned long)task_pt_regs(idle); - early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; @@ -1341,8 +1350,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); - if (is_uv_system()) - uv_system_init(); + uv_system_init(); set_mtrr_aps_delayed_init(); @@ -1581,7 +1589,6 @@ void native_cpu_die(unsigned int cpu) void play_dead_common(void) { idle_task_exit(); - reset_lazy_tlbstate(); /* Ack it */ (void)cpu_report_death(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 0653788026e2..8dabd7bf1673 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -4,6 +4,8 @@ * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <[email protected]> */ #include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> #include <linux/stacktrace.h> #include <linux/export.h> #include <linux/uaccess.h> @@ -74,6 +76,101 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE + +#define STACKTRACE_DUMP_ONCE(task) ({ \ + static bool __section(.data.unlikely) __dumped; \ + \ + if (!__dumped) { \ + __dumped = true; \ + WARN_ON(1); \ + show_stack(task, NULL); \ + } \ +}) + +static int __save_stack_trace_reliable(struct stack_trace *trace, + struct task_struct *task) +{ + struct unwind_state state; + struct pt_regs *regs; + unsigned long addr; + + for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + + regs = unwind_get_entry_regs(&state); + if (regs) { + /* + * Kernel mode registers on the stack indicate an + * in-kernel interrupt or exception (e.g., preemption + * or a page fault), which can make frame pointers + * unreliable. + */ + if (!user_mode(regs)) + return -EINVAL; + + /* + * The last frame contains the user mode syscall + * pt_regs. Skip it and finish the unwind. + */ + unwind_next_frame(&state); + if (!unwind_done(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + break; + } + + addr = unwind_get_return_address(&state); + + /* + * A NULL or invalid return address probably means there's some + * generated code which __kernel_text_address() doesn't know + * about. + */ + if (!addr) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (save_stack_address(trace, addr, false)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; + + return 0; +} + +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + int ret; + + if (!try_get_task_stack(tsk)) + return -EINVAL; + + ret = __save_stack_trace_reliable(trace, tsk); + + put_task_stack(tsk); + + return ret; +} +#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ + /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -136,4 +233,3 @@ void save_stack_trace_user(struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } - diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index a23ce84a3f6c..5f25cfbd952e 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -2,6 +2,7 @@ * x86 single-step support code, common to 32-bit and 64-bit. */ #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/ptrace.h> #include <asm/desc.h> @@ -33,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re mutex_lock(&child->mm->context.lock); if (unlikely(!child->mm->context.ldt || - seg >= child->mm->context.ldt->size)) + seg >= child->mm->context.ldt->nr_entries)) addr = -1L; /* bogus selector, access would fault */ else { desc = &child->mm->context.ldt->entries[seg]; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a55ed63b9f91..213ddf3e937d 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -1,5 +1,6 @@ #include <linux/errno.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/fs.h> @@ -16,6 +17,8 @@ #include <linux/uaccess.h> #include <linux/elf.h> +#include <asm/elf.h> +#include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> @@ -100,7 +103,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { + if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -113,10 +116,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, if (current->flags & PF_RANDOMIZE) { *begin = randomize_page(*begin, 0x02000000); } - } else { - *begin = current->mm->mmap_legacy_base; - *end = TASK_SIZE; + return; } + + *begin = get_mmap_base(1); + *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); } unsigned long @@ -140,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (end - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -175,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legacy mmap base */ - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) + if (!in_compat_syscall() && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ @@ -183,14 +187,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..a4eb27918ceb 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -42,7 +42,7 @@ #include <asm/fixmap.h> #include <asm/proto.h> #include <asm/setup.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> #include "../realmode/rm/wakeup.h" @@ -68,9 +68,9 @@ void __init tboot_probe(void) * also verify that it is mapped as we expect it before calling * set_fixmap(), to reduce chance of garbage value causing crash */ - if (!e820_any_mapped(boot_params.tboot_addr, - boot_params.tboot_addr, E820_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + if (!e820__mapped_any(boot_params.tboot_addr, + boot_params.tboot_addr, E820_TYPE_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(&tboot_mm, vaddr); - pud = pud_alloc(&tboot_mm, pgd, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; pmd = pmd_alloc(&tboot_mm, pud, vaddr); @@ -188,12 +192,12 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; - for (i = 0; i < e820->nr_map; i++) { - if ((e820->map[i].type != E820_RAM) - && (e820->map[i].type != E820_RESERVED_KERN)) + for (i = 0; i < e820_table->nr_entries; i++) { + if ((e820_table->entries[i].type != E820_TYPE_RAM) + && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) continue; - add_mac_region(e820->map[i].addr, e820->map[i].size); + add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size); } tboot->acpi_sinfo.kernel_s3_resume_vector = @@ -510,6 +514,9 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; + if (intel_iommu_tboot_noforce) + return 1; + if (no_iommu || swiotlb || dmar_disabled) pr_warning("Forcing Intel-IOMMU to enabled\n"); diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c deleted file mode 100644 index a3b875c9e6af..000000000000 --- a/arch/x86/kernel/test_nx.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * test_nx.c: functional test for NX functionality - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven <[email protected]> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include <linux/module.h> -#include <linux/sort.h> -#include <linux/slab.h> - -#include <linux/uaccess.h> -#include <asm/asm.h> - -extern int rodata_test_data; - -/* - * This file checks 4 things: - * 1) Check if the stack is not executable - * 2) Check if kmalloc memory is not executable - * 3) Check if the .rodata section is not executable - * 4) Check if the .data section of a module is not executable - * - * To do this, the test code tries to execute memory in stack/kmalloc/etc, - * and then checks if the expected trap happens. - * - * Sadly, this implies having a dynamic exception handling table entry. - * ... which can be done (and will make Rusty cry)... but it can only - * be done in a stand-alone module with only 1 entry total. - * (otherwise we'd have to sort and that's just too messy) - */ - - - -/* - * We want to set up an exception handling point on our stack, - * which means a variable value. This function is rather dirty - * and walks the exception table of the module, looking for a magic - * marker and replaces it with a specific function. - */ -static void fudze_exception_table(void *marker, void *new) -{ - struct module *mod = THIS_MODULE; - struct exception_table_entry *extable; - - /* - * Note: This module has only 1 exception table entry, - * so searching and sorting is not needed. If that changes, - * this would be the place to search and re-sort the exception - * table. - */ - if (mod->num_exentries > 1) { - printk(KERN_ERR "test_nx: too many exception table entries!\n"); - printk(KERN_ERR "test_nx: test results are not reliable.\n"); - return; - } - extable = (struct exception_table_entry *)mod->extable; - extable[0].insn = (unsigned long)new; -} - - -/* - * exception tables get their symbols translated so we need - * to use a fake function to put in there, which we can then - * replace at runtime. - */ -void foo_label(void); - -/* - * returns 0 for not-executable, negative for executable - * - * Note: we cannot allow this function to be inlined, because - * that would give us more than 1 exception table entry. - * This in turn would break the assumptions above. - */ -static noinline int test_address(void *address) -{ - unsigned long result; - - /* Set up an exception table entry for our address */ - fudze_exception_table(&foo_label, address); - result = 1; - asm volatile( - "foo_label:\n" - "0: call *%[fake_code]\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: mov %[zero], %[rslt]\n" - " ret\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - : [rslt] "=r" (result) - : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) - ); - /* change the exception table back for the next round */ - fudze_exception_table(address, &foo_label); - - if (result) - return -ENODEV; - return 0; -} - -static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */ - -static int test_NX(void) -{ - int ret = 0; - /* 0xC3 is the opcode for "ret" */ - char stackcode[] = {0xC3, 0x90, 0 }; - char *heap; - - test_data = 0xC3; - - printk(KERN_INFO "Testing NX protection\n"); - - /* Test 1: check if the stack is not executable */ - if (test_address(&stackcode)) { - printk(KERN_ERR "test_nx: stack was executable\n"); - ret = -ENODEV; - } - - - /* Test 2: Check if the heap is executable */ - heap = kmalloc(64, GFP_KERNEL); - if (!heap) - return -ENOMEM; - heap[0] = 0xC3; /* opcode for "ret" */ - - if (test_address(heap)) { - printk(KERN_ERR "test_nx: heap was executable\n"); - ret = -ENODEV; - } - kfree(heap); - - /* - * The following 2 tests currently fail, this needs to get fixed - * Until then, don't run them to avoid too many people getting scared - * by the error message - */ - - /* Test 3: Check if the .rodata section is executable */ - if (rodata_test_data != 0xC3) { - printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); - ret = -ENODEV; - } else if (test_address(&rodata_test_data)) { - printk(KERN_ERR "test_nx: .rodata section is executable\n"); - ret = -ENODEV; - } - -#if 0 - /* Test 4: Check if the .data section of a module is executable */ - if (test_address(&test_data)) { - printk(KERN_ERR "test_nx: .data section is executable\n"); - ret = -ENODEV; - } - -#endif - return ret; -} - -static void test_exit(void) -{ -} - -module_init(test_NX); -module_exit(test_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the NX infrastructure"); -MODULE_AUTHOR("Arjan van de Ven <[email protected]>"); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c deleted file mode 100644 index 222e84e2432e..000000000000 --- a/arch/x86/kernel/test_rodata.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * test_rodata.c: functional test for mark_rodata_ro function - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven <[email protected]> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include <asm/cacheflush.h> -#include <asm/sections.h> -#include <asm/asm.h> - -int rodata_test(void) -{ - unsigned long result; - unsigned long start, end; - - /* test 1: read the value */ - /* If this test fails, some previous testrun has clobbered the state */ - if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: test 1 fails (start data)\n"); - return -ENODEV; - } - - /* test 2: write to the variable; this should fault */ - /* - * If this test fails, we managed to overwrite the data - * - * This is written in assembly to be able to catch the - * exception that is supposed to happen in the correct - * case - */ - - result = 1; - asm volatile( - "0: mov %[zero],(%[rodata_test])\n" - " mov %[zero], %[rslt]\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - : [rslt] "=r" (result) - : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) - ); - - - if (!result) { - printk(KERN_ERR "rodata_test: test data was not read only\n"); - return -ENODEV; - } - - /* test 3: check the value hasn't changed */ - /* If this test fails, we managed to overwrite the data */ - if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n"); - return -ENODEV; - } - /* test 4: check if the rodata section is 4Kb aligned */ - start = (unsigned long)__start_rodata; - end = (unsigned long)__end_rodata; - if (start & (PAGE_SIZE - 1)) { - printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n"); - return -ENODEV; - } - if (end & (PAGE_SIZE - 1)) { - printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n"); - return -ENODEV; - } - - return 0; -} diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index d39c09119db6..e0754cdbad37 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -66,7 +66,7 @@ static struct irqaction irq0 = { .name = "timer" }; -void __init setup_default_timer_irq(void) +static void __init setup_default_timer_irq(void) { if (!nr_legacy_irqs()) return; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6c8934406dc9..dcd699baea1b 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) + if (LDT_empty(info) || LDT_zero(info)) { desc->a = desc->b = 0; - else + } else { fill_ldt(desc, info); + + /* + * Always set the accessed bit so that the CPU + * doesn't try to write to the (read-only) GDT. + */ + desc->type |= 1; + } ++info; ++desc; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bf0c6d049080..bf54309b85da 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/kexec.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> @@ -168,6 +169,37 @@ void ist_end_non_atomic(void) preempt_disable(); } +int is_valid_bugaddr(unsigned long addr) +{ + unsigned short ud; + + if (addr < TASK_SIZE_MAX) + return 0; + + if (probe_kernel_address((unsigned short *)addr, ud)) + return 0; + + return ud == INSN_UD0 || ud == INSN_UD2; +} + +int fixup_bug(struct pt_regs *regs, int trapnr) +{ + if (trapnr != X86_TRAP_UD) + return 0; + + switch (report_bug(regs->ip, regs)) { + case BUG_TRAP_TYPE_NONE: + case BUG_TRAP_TYPE_BUG: + break; + + case BUG_TRAP_TYPE_WARN: + regs->ip += LEN_UD0; + return 1; + } + + return 0; +} + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) @@ -186,12 +218,15 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs, trapnr)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = trapnr; - die(str, regs, error_code); - } - return 0; + if (fixup_exception(regs, trapnr)) + return 0; + + if (fixup_bug(regs, trapnr)) + return 0; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + die(str, regs, error_code); } return -1; @@ -254,7 +289,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } @@ -518,7 +553,7 @@ do_general_protection(struct pt_regs *regs, long error_code) pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } @@ -563,11 +598,9 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_disable(); cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -742,14 +775,12 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_disable(); cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } @@ -769,7 +800,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); exit: diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e41af597aed8..796d96bb0821 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/init.h> #include <linux/export.h> #include <linux/timer.h> @@ -50,115 +51,34 @@ static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; struct clocksource *art_related_clocksource; -/* - * Use a ring-buffer like data structure, where a writer advances the head by - * writing a new data entry and a reader advances the tail when it observes a - * new entry. - * - * Writers are made to wait on readers until there's space to write a new - * entry. - * - * This means that we can always use an {offset, mul} pair to compute a ns - * value that is 'roughly' in the right direction, even if we're writing a new - * {offset, mul} pair during the clock read. - * - * The down-side is that we can no longer guarantee strict monotonicity anymore - * (assuming the TSC was that to begin with), because while we compute the - * intersection point of the two clock slopes and make sure the time is - * continuous at the point of switching; we can no longer guarantee a reader is - * strictly before or after the switch point. - * - * It does mean a reader no longer needs to disable IRQs in order to avoid - * CPU-Freq updates messing with his times, and similarly an NMI reader will - * no longer run the risk of hitting half-written state. - */ - struct cyc2ns { - struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ - struct cyc2ns_data *head; /* 48 + 8 = 56 */ - struct cyc2ns_data *tail; /* 56 + 8 = 64 */ -}; /* exactly fits one cacheline */ - -static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); - -struct cyc2ns_data *cyc2ns_read_begin(void) -{ - struct cyc2ns_data *head; - - preempt_disable(); - - head = this_cpu_read(cyc2ns.head); - /* - * Ensure we observe the entry when we observe the pointer to it. - * matches the wmb from cyc2ns_write_end(). - */ - smp_read_barrier_depends(); - head->__count++; - barrier(); + struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ + seqcount_t seq; /* 32 + 4 = 36 */ - return head; -} +}; /* fits one cacheline */ -void cyc2ns_read_end(struct cyc2ns_data *head) -{ - barrier(); - /* - * If we're the outer most nested read; update the tail pointer - * when we're done. This notifies possible pending writers - * that we've observed the head pointer and that the other - * entry is now free. - */ - if (!--head->__count) { - /* - * x86-TSO does not reorder writes with older reads; - * therefore once this write becomes visible to another - * cpu, we must be finished reading the cyc2ns_data. - * - * matches with cyc2ns_write_begin(). - */ - this_cpu_write(cyc2ns.tail, head); - } - preempt_enable(); -} +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -/* - * Begin writing a new @data entry for @cpu. - * - * Assumes some sort of write side lock; currently 'provided' by the assumption - * that cpufreq will call its notifiers sequentially. - */ -static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +void cyc2ns_read_begin(struct cyc2ns_data *data) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - struct cyc2ns_data *data = c2n->data; + int seq, idx; - if (data == c2n->head) - data++; + preempt_disable_notrace(); - /* XXX send an IPI to @cpu in order to guarantee a read? */ + do { + seq = this_cpu_read(cyc2ns.seq.sequence); + idx = seq & 1; - /* - * When we observe the tail write from cyc2ns_read_end(), - * the cpu must be done with that entry and its safe - * to start writing to it. - */ - while (c2n->tail == data) - cpu_relax(); + data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); + data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); + data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - return data; + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +void cyc2ns_read_end(void) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - /* - * Ensure the @data writes are visible before we publish the - * entry. Matches the data-depencency in cyc2ns_read_begin(). - */ - smp_wmb(); - - ACCESS_ONCE(c2n->head) = data; + preempt_enable_notrace(); } /* @@ -190,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data) data->cyc2ns_mul = 0; data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; - data->__count = 0; } static void cyc2ns_init(int cpu) @@ -200,51 +119,29 @@ static void cyc2ns_init(int cpu) cyc2ns_data_init(&c2n->data[0]); cyc2ns_data_init(&c2n->data[1]); - c2n->head = c2n->data; - c2n->tail = c2n->data; + seqcount_init(&c2n->seq); } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data, *tail; + struct cyc2ns_data data; unsigned long long ns; - /* - * See cyc2ns_read_*() for details; replicated in order to avoid - * an extra few instructions that came with the abstraction. - * Notable, it allows us to only do the __count and tail update - * dance when its actually needed. - */ + cyc2ns_read_begin(&data); - preempt_disable_notrace(); - data = this_cpu_read(cyc2ns.head); - tail = this_cpu_read(cyc2ns.tail); + ns = data.cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - if (likely(data == tail)) { - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); - } else { - data->__count++; - - barrier(); - - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); - - barrier(); - - if (!--data->__count) - this_cpu_write(cyc2ns.tail, data); - } - preempt_enable_notrace(); + cyc2ns_read_end(); return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { - unsigned long long tsc_now, ns_now; - struct cyc2ns_data *data; + unsigned long long ns_now; + struct cyc2ns_data data; + struct cyc2ns *c2n; unsigned long flags; local_irq_save(flags); @@ -253,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) if (!khz) goto done; - data = cyc2ns_write_begin(cpu); - - tsc_now = rdtsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -263,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, + clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -272,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ - if (data->cyc2ns_shift == 32) { - data->cyc2ns_shift = 31; - data->cyc2ns_mul >>= 1; + if (data.cyc2ns_shift == 32) { + data.cyc2ns_shift = 31; + data.cyc2ns_mul >>= 1; } - data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); + data.cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); - cyc2ns_write_end(cpu, data); + c2n = per_cpu_ptr(&cyc2ns, cpu); + + raw_write_seqcount_latch(&c2n->seq); + c2n->data[0] = data; + raw_write_seqcount_latch(&c2n->seq); + c2n->data[1] = data; done: - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); local_irq_restore(flags); } + /* * Scheduler clock - returns current time in nanosec units. */ @@ -326,9 +226,16 @@ unsigned long long sched_clock(void) { return paravirt_sched_clock(); } + +bool using_native_sched_clock(void) +{ + return pv_time_ops.sched_clock == native_sched_clock; +} #else unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); + +bool using_native_sched_clock(void) { return true; } #endif int check_tsc_unstable(void) @@ -366,6 +273,8 @@ static int __init tsc_setup(char *str) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; + if (!strcmp(str, "unstable")) + mark_tsc_unstable("boot parameter"); return 1; } @@ -978,7 +887,6 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ - /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency * changes. * @@ -1019,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); - set_cyc2ns_scale(tsc_khz, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc()); } return 0; @@ -1107,6 +1015,27 @@ static u64 read_tsc(struct clocksource *cs) return (u64)rdtsc_ordered(); } +static void tsc_cs_mark_unstable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + tsc_unstable = 1; + if (using_native_sched_clock()) + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to clocksource watchdog\n"); +} + +static void tsc_cs_tick_stable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + if (using_native_sched_clock()) + sched_clock_tick_stable(); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1119,22 +1048,26 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, }; void mark_tsc_unstable(char *reason) { - if (!tsc_unstable) { - tsc_unstable = 1; + if (tsc_unstable) + return; + + tsc_unstable = 1; + if (using_native_sched_clock()) clear_sched_clock_stable(); - disable_sched_clock_irqtime(); - pr_info("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) - clocksource_mark_unstable(&clocksource_tsc); - else { - clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; - clocksource_tsc.rating = 0; - } + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to %s\n", reason); + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) { + clocksource_mark_unstable(&clocksource_tsc); + } else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; + clocksource_tsc.rating = 0; } } @@ -1232,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; + int cpu; /* Don't bother refining TSC on unstable systems */ if (check_tsc_unstable()) @@ -1282,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work) /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); + /* Update the sched_clock() rate to match the clocksource one */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; @@ -1310,6 +1248,8 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } @@ -1325,7 +1265,7 @@ device_initcall(init_tsc_clocksource); void __init tsc_init(void) { - u64 lpj; + u64 lpj, cyc; int cpu; if (!boot_cpu_has(X86_FEATURE_TSC)) { @@ -1356,15 +1296,19 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ + cyc = rdtsc(); for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu, cyc); } if (tsc_disabled > 0) @@ -1384,12 +1328,10 @@ void __init tsc_init(void) use_tsc_delay(); + check_system_tsc_reliable(); + if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - else - tsc_store_and_check_tsc_adjust(true); - - check_system_tsc_reliable(); detect_art(); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index d0db011051a5..7842371bc9e4 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -71,13 +71,8 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, * non zero. We don't do that on non boot cpus because physical * hotplug should have set the ADJUST register to a value > 0 so * the TSC is in sync with the already running cpus. - * - * But we always force positive ADJUST values. Otherwise the TSC - * deadline timer creates an interrupt storm. We also have to - * prevent values > 0x7FFFFFFF as those wreckage the timer as well. */ - if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) || - (bootval > 0x7FFFFFFF)) { + if (bootcpu && bootval != 0) { pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); wrmsrl(MSR_IA32_TSC_ADJUST, 0); @@ -286,13 +281,6 @@ void check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (tsc_clocksource_reliable) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - /* * Set the maximum number of test runs to * 1 if the CPU does not provide the TSC_ADJUST MSR @@ -380,14 +368,19 @@ void check_tsc_sync_target(void) int cpus = 2; /* Also aborts if there is no TSC. */ - if (unsynchronized_tsc() || tsc_clocksource_reliable) + if (unsynchronized_tsc()) return; /* * Store, verify and sanitize the TSC adjust register. If * successful skip the test. + * + * The test is also skipped when the TSC is marked reliable. This + * is true for SoCs which have no fallback clocksource. On these + * SoCs the TSC is frequency synchronized, but still the TSC ADJUST + * register might have been wreckaged by the BIOS.. */ - if (tsc_store_and_check_tsc_adjust(false)) { + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) { atomic_inc(&skip_test); return; } @@ -453,20 +446,6 @@ retry: */ cur->adjusted += cur_max_warp; - /* - * TSC deadline timer stops working or creates an interrupt storm - * with adjust values < 0 and > x07ffffff. - * - * To allow adjust values > 0x7FFFFFFF we need to disable the - * deadline timer and use the local APIC timer, but that requires - * more intrusive changes and we do not have any useful information - * from Intel about the underlying HW wreckage yet. - */ - if (cur->adjusted < 0) - cur->adjusted = 0; - if (cur->adjusted > 0x7FFFFFFF) - cur->adjusted = 0x7FFFFFFF; - pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", cpu, cur_max_warp, cur->adjusted); diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 23d15565d02a..b9389d72b2f7 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -1,4 +1,8 @@ #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/interrupt.h> +#include <asm/sections.h> #include <asm/ptrace.h> #include <asm/bitops.h> #include <asm/stacktrace.h> @@ -21,53 +25,53 @@ val; \ }) -static void unwind_dump(struct unwind_state *state, unsigned long *sp) +static void unwind_dump(struct unwind_state *state) { static bool dumped_before = false; bool prev_zero, zero = false; - unsigned long word; + unsigned long word, *sp; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; if (dumped_before) return; dumped_before = true; - printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n", + printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); - for (sp = state->orig_sp; sp < state->stack_info.end; sp++) { - word = READ_ONCE_NOCHECK(*sp); + for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) + break; - prev_zero = zero; - zero = word == 0; + for (; sp < stack_info.end; sp++) { - if (zero) { - if (!prev_zero) - printk_deferred("%p: %016x ...\n", sp, 0); - continue; - } + word = READ_ONCE_NOCHECK(*sp); + + prev_zero = zero; + zero = word == 0; - printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word); + if (zero) { + if (!prev_zero) + printk_deferred("%p: %0*x ...\n", + sp, BITS_PER_LONG/4, 0); + continue; + } + + printk_deferred("%p: %0*lx (%pB)\n", + sp, BITS_PER_LONG/4, word, (void *)word); + } } } unsigned long unwind_get_return_address(struct unwind_state *state) { - unsigned long addr; - unsigned long *addr_p = unwind_get_return_address_ptr(state); - if (unwind_done(state)) return 0; - if (state->regs && user_mode(state->regs)) - return 0; - - addr = READ_ONCE_TASK_STACK(state->task, *addr_p); - addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, - addr_p); - - return __kernel_text_address(addr) ? addr : 0; + return __kernel_text_address(state->ip) ? state->ip : 0; } EXPORT_SYMBOL_GPL(unwind_get_return_address); @@ -80,19 +84,99 @@ static size_t regs_size(struct pt_regs *regs) return sizeof(*regs); } -static bool is_last_task_frame(struct unwind_state *state) +static bool in_entry_code(unsigned long ip) +{ + char *addr = (char *)ip; + + if (addr >= __entry_text_start && addr < __entry_text_end) + return true; + +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + if (addr >= __irqentry_text_start && addr < __irqentry_text_end) + return true; +#endif + + return false; +} + +static inline unsigned long *last_frame(struct unwind_state *state) { - unsigned long bp = (unsigned long)state->bp; - unsigned long regs = (unsigned long)task_pt_regs(state->task); + return (unsigned long *)task_pt_regs(state->task) - 2; +} + +static bool is_last_frame(struct unwind_state *state) +{ + return state->bp == last_frame(state); +} + +#ifdef CONFIG_X86_32 +#define GCC_REALIGN_WORDS 3 +#else +#define GCC_REALIGN_WORDS 1 +#endif + +static inline unsigned long *last_aligned_frame(struct unwind_state *state) +{ + return last_frame(state) - GCC_REALIGN_WORDS; +} + +static bool is_last_aligned_frame(struct unwind_state *state) +{ + unsigned long *last_bp = last_frame(state); + unsigned long *aligned_bp = last_aligned_frame(state); /* - * We have to check for the last task frame at two different locations - * because gcc can occasionally decide to realign the stack pointer and - * change the offset of the stack frame by a word in the prologue of a - * function called by head/entry code. + * GCC can occasionally decide to realign the stack pointer and change + * the offset of the stack frame in the prologue of a function called + * by head/entry code. Examples: + * + * <start_secondary>: + * push %edi + * lea 0x8(%esp),%edi + * and $0xfffffff8,%esp + * pushl -0x4(%edi) + * push %ebp + * mov %esp,%ebp + * + * <x86_64_start_kernel>: + * lea 0x8(%rsp),%r10 + * and $0xfffffffffffffff0,%rsp + * pushq -0x8(%r10) + * push %rbp + * mov %rsp,%rbp + * + * After aligning the stack, it pushes a duplicate copy of the return + * address before pushing the frame pointer. + */ + return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1)); +} + +static bool is_last_ftrace_frame(struct unwind_state *state) +{ + unsigned long *last_bp = last_frame(state); + unsigned long *last_ftrace_bp = last_bp - 3; + + /* + * When unwinding from an ftrace handler of a function called by entry + * code, the stack layout of the last frame is: + * + * bp + * parent ret addr + * bp + * function ret addr + * parent ret addr + * pt_regs + * ----------------- */ - return bp == regs - FRAME_HEADER_SIZE || - bp == regs - FRAME_HEADER_SIZE - sizeof(long); + return (state->bp == last_ftrace_bp && + *state->bp == *(state->bp + 2) && + *(state->bp + 1) == *(state->bp + 4)); +} + +static bool is_last_task_frame(struct unwind_state *state) +{ + return is_last_frame(state) || is_last_aligned_frame(state) || + is_last_ftrace_frame(state); } /* @@ -109,26 +193,70 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } -static bool update_stack_state(struct unwind_state *state, void *addr, - size_t len) +static bool update_stack_state(struct unwind_state *state, + unsigned long *next_bp) { struct stack_info *info = &state->stack_info; - enum stack_type orig_type = info->type; + enum stack_type prev_type = info->type; + struct pt_regs *regs; + unsigned long *frame, *prev_frame_end, *addr_p, addr; + size_t len; + + if (state->regs) + prev_frame_end = (void *)state->regs + regs_size(state->regs); + else + prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE; + + /* Is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + frame = (unsigned long *)regs; + len = regs_size(regs); + state->got_irq = true; + } else { + frame = next_bp; + len = FRAME_HEADER_SIZE; + } /* - * If addr isn't on the current stack, switch to the next one. + * If the next bp isn't on the current stack, switch to the next one. * * We may have to traverse multiple stacks to deal with the possibility - * that 'info->next_sp' could point to an empty stack and 'addr' could - * be on a subsequent stack. + * that info->next_sp could point to an empty stack and the next bp + * could be on a subsequent stack. */ - while (!on_stack(info, addr, len)) + while (!on_stack(info, frame, len)) if (get_stack_info(info->next_sp, state->task, info, &state->stack_mask)) return false; - if (!state->orig_sp || info->type != orig_type) - state->orig_sp = addr; + /* Make sure it only unwinds up and doesn't overlap the prev frame: */ + if (state->orig_sp && state->stack_info.type == prev_type && + frame < prev_frame_end) + return false; + + /* Move state to the next frame: */ + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + + /* Save the return address: */ + if (state->regs && user_mode(state->regs)) + state->ip = 0; + else { + addr_p = unwind_get_return_address_ptr(state); + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + addr, addr_p); + } + + /* Save the original stack pointer for unwind_dump(): */ + if (!state->orig_sp) + state->orig_sp = frame; return true; } @@ -136,14 +264,12 @@ static bool update_stack_state(struct unwind_state *state, void *addr, bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; - unsigned long *next_bp, *next_frame; - size_t next_len; - enum stack_type prev_type = state->stack_info.type; + unsigned long *next_bp; if (unwind_done(state)) return false; - /* have we reached the end? */ + /* Have we reached the end? */ if (state->regs && user_mode(state->regs)) goto the_end; @@ -171,58 +297,25 @@ bool unwind_next_frame(struct unwind_state *state) */ state->regs = regs; state->bp = NULL; + state->ip = 0; return true; } - /* get the next frame pointer */ + /* Get the next frame pointer: */ if (state->regs) next_bp = (unsigned long *)state->regs->bp; else - next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp); - - /* is the next frame pointer an encoded pointer to pt_regs? */ - regs = decode_frame_pointer(next_bp); - if (regs) { - next_frame = (unsigned long *)regs; - next_len = sizeof(*regs); - } else { - next_frame = next_bp; - next_len = FRAME_HEADER_SIZE; - } - - /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_frame, next_len)) { - /* - * Don't warn on bad regs->bp. An interrupt in entry code - * might cause a false positive warning. - */ - if (state->regs) - goto the_end; + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); + /* Move to the next frame if it's safe: */ + if (!update_stack_state(state, next_bp)) goto bad_address; - } - - /* Make sure it only unwinds up and doesn't overlap the last frame: */ - if (state->stack_info.type == prev_type) { - if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) - goto bad_address; - - if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) - goto bad_address; - } - - /* move to the next frame */ - if (regs) { - state->regs = regs; - state->bp = NULL; - } else { - state->bp = next_bp; - state->regs = NULL; - } return true; bad_address: + state->error = true; + /* * When unwinding a non-current task, the task might actually be * running on another CPU, in which case it could be modifying its @@ -233,18 +326,29 @@ bad_address: if (state->task != current) goto the_end; + /* + * Don't warn if the unwinder got lost due to an interrupt in entry + * code or in the C handler before the first frame pointer got set up: + */ + if (state->got_irq && in_entry_code(state->ip)) + goto the_end; + if (state->regs && + state->regs->sp >= (unsigned long)last_aligned_frame(state) && + state->regs->sp < (unsigned long)task_pt_regs(state->task)) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", state->regs, state->task->comm, - state->task->pid, next_frame); - unwind_dump(state, (unsigned long *)state->regs); + state->task->pid, next_bp); + unwind_dump(state); } else { printk_deferred_once(KERN_WARNING "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", state->bp, state->task->comm, - state->task->pid, next_frame); - unwind_dump(state, state->bp); + state->task->pid, next_bp); + unwind_dump(state); } the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; @@ -255,35 +359,24 @@ EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { - unsigned long *bp, *frame; - size_t len; + unsigned long *bp; memset(state, 0, sizeof(*state)); state->task = task; + state->got_irq = (regs); - /* don't even attempt to start from user mode regs */ + /* Don't even attempt to start from user mode regs: */ if (regs && user_mode(regs)) { state->stack_info.type = STACK_TYPE_UNKNOWN; return; } - /* set up the starting stack frame */ bp = get_frame_pointer(task, regs); - regs = decode_frame_pointer(bp); - if (regs) { - state->regs = regs; - frame = (unsigned long *)regs; - len = sizeof(*regs); - } else { - state->bp = bp; - frame = bp; - len = FRAME_HEADER_SIZE; - } - /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(frame, state->task, &state->stack_info, + /* Initialize stack info and make sure the frame data is accessible: */ + get_stack_info(bp, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, frame, len); + update_stack_state(state, bp); /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 22881ddcbb9f..039f36738e49 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -34,7 +34,7 @@ bool unwind_next_frame(struct unwind_state *state) return true; } - state->sp = info->next_sp; + state->sp = PTR_ALIGN(info->next_sp, sizeof(long)); } while (!get_stack_info(state->sp, state->task, info, &state->stack_mask)); @@ -49,7 +49,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, memset(state, 0, sizeof(*state)); state->task = task; - state->sp = first_frame; + state->sp = PTR_ALIGN(first_frame, sizeof(long)); get_stack_info(first_frame, state->task, &state->stack_info, &state->stack_mask); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index ec5d7545e6dc..7924a5356c8a 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -35,6 +35,7 @@ #include <linux/interrupt.h> #include <linux/syscalls.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/string.h> @@ -160,24 +161,29 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) static void mark_screen_rdonly(struct mm_struct *mm) { + struct vm_area_struct *vma; + spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - spinlock_t *ptl; int i; down_write(&mm->mmap_sem); pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; - pud = pud_offset(pgd, 0xA0000); + p4d = p4d_offset(pgd, 0xA0000); + if (p4d_none_or_clear_bad(p4d)) + goto out; + pud = pud_offset(p4d, 0xA0000); if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); if (pmd_trans_huge(*pmd)) { - struct vm_area_struct *vma = find_vma(mm, 0xA0000); + vma = find_vma(mm, 0xA0000); split_huge_pmd(vma, pmd, 0xA0000); } if (pmd_none_or_clear_bad(pmd)) @@ -191,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb(); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e79f15f108a8..c8a3b61be0aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -146,6 +146,7 @@ SECTIONS _edata = .; } :data + BUG_TABLE . = ALIGN(PAGE_SIZE); __vvar_page = .; @@ -345,7 +346,6 @@ SECTIONS DISCARDS /DISCARD/ : { *(.eh_frame) - *(__func_stack_frame_non_standard) } } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 11a93f005268..a088b2c47f73 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,7 +14,7 @@ #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/time.h> #include <asm/irq.h> #include <asm/io_apic.h> @@ -38,7 +38,7 @@ struct x86_init_ops x86_init __initdata = { .resources = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, - .memory_setup = default_machine_specific_memory_setup, + .memory_setup = e820__memory_setup_default, }, .mpparse = { |