Diffstat (limited to 'arch/x86/kernel')
51 files changed, 1965 insertions, 1444 deletions
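Much of the mechanical churn below is the e820 API rename that comes with the new <asm/e820/api.h> header; every e820 hunk in this diff applies the same old -> new mapping:

    <asm/e820.h>              ->  <asm/e820/api.h>
    e820_add_region()         ->  e820__range_add()
    e820_any_mapped()         ->  e820__mapped_any()
    update_e820()             ->  e820__update_table_print()
    e820_reserve_resources()  ->  e820__reserve_resources()
    E820_RAM / E820_ACPI / E820_RESERVED  ->  E820_TYPE_RAM / E820_TYPE_ACPI / E820_TYPE_RESERVED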
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b2879cc23db4..6bb680671088 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,6 +37,7 @@ #include <linux/pci.h> #include <linux/efi-bgrt.h> +#include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> #include <asm/pgtable.h> @@ -1564,12 +1565,6 @@ int __init early_acpi_boot_init(void) return 0; } -static int __init acpi_parse_bgrt(struct acpi_table_header *table) -{ - efi_bgrt_init(table); - return 0; -} - int __init acpi_boot_init(void) { /* those are executed after early-quirks are executed */ @@ -1729,6 +1724,6 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820_add_region(addr, size, E820_ACPI); - update_e820(); + e820__range_add(addr, size, E820_TYPE_ACPI); + e820__update_table_print(); } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 0a2bb1f62e72..ef2859f9fcce 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,7 +21,7 @@ #include <linux/pci.h> #include <linux/bitops.h> #include <linux/suspend.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -306,13 +306,13 @@ void __init early_gart_iommu_check(void) fix = 1; if (gart_fix_e820 && !fix && aper_enabled) { - if (e820_any_mapped(aper_base, aper_base + aper_size, - E820_RAM)) { + if (e820__mapped_any(aper_base, aper_base + aper_size, + E820_TYPE_RAM)) { /* reserve it, so we can reuse it in second kernel */ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", aper_base, aper_base + aper_size - 1); - e820_add_region(aper_base, aper_size, E820_RESERVED); - update_e820(); + e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED); + e820__update_table_print(); } } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8ccb7ef512e0..2d75faf743f2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -731,8 +731,10 @@ static int __init calibrate_APIC_clock(void) TICK_NSEC, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -778,8 +780,10 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; @@ -1789,8 +1793,8 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; /* - * acpi lapic path already maps that address in - * acpi_register_lapic_address() + * If the system has ACPI MADT tables or MP info, the LAPIC + * address is already registered. 
*/ if (!acpi_lapic && !smp_found_config) register_lapic_address(apic_phys); @@ -2237,7 +2241,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) static void __init apic_bsp_up_setup(void) { #ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); + apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); #else /* * Hack: In case of kdump, after a crash, kernel might be booting @@ -2627,7 +2631,7 @@ static int __init lapic_insert_resource(void) } /* - * need call insert after e820_reserve_resources() + * need call insert after e820__reserve_resources() * that is using request_resource */ late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b109e4389c92..2262eb6df796 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -26,7 +26,7 @@ #include <linux/interrupt.h> #include <asm/acpi.h> -#include <asm/e820.h> +#include <asm/e820/api.h> static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index c48264e202fd..2e8f7f048f4f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -25,7 +25,7 @@ #include <linux/interrupt.h> #include <asm/acpi.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e9f8f8cdd570..b487b3a01615 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -34,6 +34,7 @@ #include <asm/uv/bios.h> #include <asm/uv/uv.h> #include <asm/apic.h> +#include <asm/e820/api.h> #include <asm/ipi.h> #include <asm/smp.h> #include <asm/x86_init.h> @@ -1105,7 +1106,8 @@ void __init uv_init_hub_info(struct uv_hub_info_s *hi) node_id.v = uv_read_local_mmr(UVH_NODE_ID); uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; - hi->gnode_upper = (unsigned long)hi->gnode_extra << mn.m_val; + if (mn.m_val) + hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; if (uv_gp_table) { hi->global_mmr_base = uv_gp_table->mmr_base; diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 43955ee6715b..44207b71fee1 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -3,7 +3,7 @@ #include <linux/sched/clock.h> #include <asm/cpufeature.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 063197771b8d..dfa90a3a5145 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) return; } - if (ring3mwait_disabled) { - msr_clear_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + if (ring3mwait_disabled) return; - } - - msr_set_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); if (c == &boot_cpu_data) ELF_HWCAP2 |= HWCAP2_RING3MWAIT; @@ -488,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c) init_intel_energy_perf(c); } +static void init_cpuid_fault(struct cpuinfo_x86 *c) +{ + u64 msr; + 
+ if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); + } +} + +static void init_intel_misc_features(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + return; + + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ + init_cpuid_fault(c); + probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -602,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_energy_perf(c); - probe_xeon_phi_r3mwait(c); + init_intel_misc_features(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5a533fefefa0..5b366462f579 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -32,55 +32,98 @@ #include <asm/intel-family.h> #include <asm/intel_rdt.h> +#define MAX_MBA_BW 100u +#define MBA_IS_LINEAR 0x4 + /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * Used to store the max resource name width and max resource data width + * to display the schemata in a tabular format + */ +int max_name_width, max_data_width; + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); + #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { { - .name = "L3", - .domains = domain_init(RDT_RESOURCE_L3), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3DATA", - .domains = domain_init(RDT_RESOURCE_L3DATA), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 0 + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3CODE", - .domains = domain_init(RDT_RESOURCE_L3CODE), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 1 + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L2", - .domains = domain_init(RDT_RESOURCE_L2), - .msr_base = IA32_L2_CBM_BASE, - .min_cbm_bits = 1, - 
.cache_level = 2, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "MB", + .domains = domain_init(RDT_RESOURCE_MBA), + .msr_base = IA32_MBA_THRTL_BASE, + .msr_update = mba_wrmsr, + .cache_level = 3, + .parse_ctrlval = parse_bw, + .format_str = "%d=%*d", }, }; -static int cbm_idx(struct rdt_resource *r, int closid) +static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) { - return closid * r->cbm_idx_multi + r->cbm_idx_offset; + return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; } /* @@ -118,9 +161,9 @@ static inline bool cache_alloc_hsw_probe(void) return false; r->num_closid = 4; - r->cbm_len = 20; - r->max_cbm = max_cbm; - r->min_cbm_bits = 2; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.min_cbm_bits = 2; r->capable = true; r->enabled = true; @@ -130,16 +173,66 @@ static inline bool cache_alloc_hsw_probe(void) return false; } -static void rdt_get_config(int idx, struct rdt_resource *r) +/* + * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values + * exposed to user interface and the h/w understandable delay values. + * + * The non-linear delay values have the granularity of power of two + * and also the h/w does not guarantee a curve for configured delay + * values vs. actual b/w enforced. + * Hence we need a mapping that is pre calibrated so the user can + * express the memory b/w as a percentage value. + */ +static inline bool rdt_get_mb_table(struct rdt_resource *r) +{ + /* + * There are no Intel SKUs as of now to support non-linear delay. + */ + pr_info("MBA b/w map not implemented for cpu:%d, model:%d", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return false; +} + +static bool rdt_get_mem_config(struct rdt_resource *r) +{ + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->membw.max_delay = eax.split.max_delay + 1; + r->default_ctrl = MAX_MBA_BW; + if (ecx & MBA_IS_LINEAR) { + r->membw.delay_linear = true; + r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; + r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + } else { + if (!rdt_get_mb_table(r)) + return false; + } + r->data_width = 3; + rdt_get_mba_infofile(r); + + r->capable = true; + r->enabled = true; + + return true; +} + +static void rdt_get_cache_config(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; - union cpuid_0x10_1_edx edx; + union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->cbm_len = eax.split.cbm_len + 1; - r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.cbm_len = eax.split.cbm_len + 1; + r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->data_width = (r->cache.cbm_len + 3) / 4; + rdt_get_cache_infofile(r); r->capable = true; r->enabled = true; } @@ -150,8 +243,9 @@ static void rdt_get_cdp_l3_config(int type) struct rdt_resource *r = &rdt_resources_all[type]; r->num_closid = r_l3->num_closid / 2; - r->cbm_len = r_l3->cbm_len; - r->max_cbm = r_l3->max_cbm; + r->cache.cbm_len = r_l3->cache.cbm_len; + r->default_ctrl = r_l3->default_ctrl; + r->data_width = (r->cache.cbm_len + 3) / 4; r->capable = true; /* * By default, CDP is disabled. 
CDP can be enabled by mount parameter @@ -160,33 +254,6 @@ static void rdt_get_cdp_l3_config(int type) r->enabled = false; } -static inline bool get_rdt_resources(void) -{ - bool ret = false; - - if (cache_alloc_hsw_probe()) - return true; - - if (!boot_cpu_has(X86_FEATURE_RDT_A)) - return false; - - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { - rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); - rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); - } - ret = true; - } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { - /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); - ret = true; - } - - return ret; -} - static int get_cache_id(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); @@ -200,29 +267,55 @@ static int get_cache_id(int cpu, int level) return -1; } -void rdt_cbm_update(void *arg) +/* + * Map the memory b/w percentage value to delay values + * that can be written to QOS_MSRs. + * There are currently no SKUs which support non linear delay values. + */ +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { - struct msr_param *m = (struct msr_param *)arg; + if (r->membw.delay_linear) + return MAX_MBA_BW - bw; + + pr_warn_once("Non Linear delay-bw map not supported but queried\n"); + return r->default_ctrl; +} + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + /* Write the delay values for mba. */ + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); +} + +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); +} + +void rdt_ctrl_update(void *arg) +{ + struct msr_param *m = arg; struct rdt_resource *r = m->res; - int i, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) - goto found; + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + r->msr_update(d, m, r); + return; + } } - pr_info_once("cpu %d not found in any domain for resource %s\n", + pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); - - return; - -found: - for (i = m->low; i < m->high; i++) { - int idx = cbm_idx(r, i); - - wrmsrl(r->msr_base + idx, d->cbm[i]); - } } /* @@ -258,6 +351,32 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +{ + struct msr_param m; + u32 *dc; + int i; + + dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + if (!dc) + return -ENOMEM; + + d->ctrl_val = dc; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100 + */ + for (i = 0; i < r->num_closid; i++, dc++) + *dc = r->default_ctrl; + + m.low = 0; + m.high = r->num_closid; + r->msr_update(d, &m, r); + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. 
* @@ -273,7 +392,7 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int i, id = get_cache_id(cpu, r->cache_level); + int id = get_cache_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; @@ -294,22 +413,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d->id = id; - d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); - if (!d->cbm) { + if (domain_setup_ctrlval(r, d)) { kfree(d); return; } - for (i = 0; i < r->num_closid; i++) { - int idx = cbm_idx(r, i); - - d->cbm[i] = r->max_cbm; - wrmsrl(r->msr_base + idx, d->cbm[i]); - } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); - r->num_domains++; } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -325,8 +435,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - r->num_domains--; - kfree(d->cbm); + kfree(d->ctrl_val); list_del(&d->list); kfree(d); } @@ -374,6 +483,57 @@ static int intel_rdt_offline_cpu(unsigned int cpu) return 0; } +/* + * Choose a width for the resource name and resource data based on the + * resource that has widest name and cbm. + */ +static __init void rdt_init_padding(void) +{ + struct rdt_resource *r; + int cl; + + for_each_capable_rdt_resource(r) { + cl = strlen(r->name); + if (cl > max_name_width) + max_name_width = cl; + + if (r->data_width > max_data_width) + max_data_width = r->data_width; + } +} + +static __init bool get_rdt_resources(void) +{ + bool ret = false; + + if (cache_alloc_hsw_probe()) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); + } + ret = true; + } + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + ret = true; + } + + if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) + ret = true; + } + + return ret; +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -382,6 +542,8 @@ static int __init intel_rdt_late_init(void) if (!get_rdt_resources()) return -ENODEV; + rdt_init_padding(); + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rdt/cat:online:", intel_rdt_online_cpu, intel_rdt_offline_cpu); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9ac2a5cdd9c2..f5af0cc7eb0d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -174,6 +174,13 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static bool is_cpu_list(struct kernfs_open_file *of) +{ + struct rftype *rft = of->kn->priv; + + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; +} + static int rdtgroup_cpus_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { @@ -182,10 +189,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); - else + if (rdtgrp) { + seq_printf(s, is_cpu_list(of) ? 
"%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); + } else { ret = -ENOENT; + } rdtgroup_kn_unlock(of->kn); return ret; @@ -252,7 +261,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - ret = cpumask_parse(buf, newmask); + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + if (ret) goto unlock; @@ -473,6 +486,14 @@ static struct rftype rdtgroup_base_files[] = { .seq_show = rdtgroup_cpus_show, }, { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + }, + { .name = "tasks", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -494,32 +515,56 @@ static int rdt_num_closids_show(struct kernfs_open_file *of, struct rdt_resource *r = of->kn->parent->priv; seq_printf(seq, "%d\n", r->num_closid); + return 0; +} + +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + seq_printf(seq, "%x\n", r->default_ctrl); return 0; } -static int rdt_cbm_mask_show(struct kernfs_open_file *of, +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - seq_printf(seq, "%x\n", r->max_cbm); + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + seq_printf(seq, "%u\n", r->membw.min_bw); return 0; } -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, +static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - seq_printf(seq, "%d\n", r->min_cbm_bits); + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} + +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + seq_printf(seq, "%u\n", r->membw.delay_linear); return 0; } /* rdtgroup information files for one cache resource. */ -static struct rftype res_info_files[] = { +static struct rftype res_cache_info_files[] = { { .name = "num_closids", .mode = 0444, @@ -530,7 +575,7 @@ static struct rftype res_info_files[] = { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_cbm_mask_show, + .seq_show = rdt_default_ctrl_show, }, { .name = "min_cbm_bits", @@ -540,11 +585,52 @@ static struct rftype res_info_files[] = { }, }; +/* rdtgroup information files for memory bandwidth. 
*/ +static struct rftype res_mba_info_files[] = { + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + }, +}; + +void rdt_get_mba_infofile(struct rdt_resource *r) +{ + r->info_files = res_mba_info_files; + r->nr_info_files = ARRAY_SIZE(res_mba_info_files); +} + +void rdt_get_cache_infofile(struct rdt_resource *r) +{ + r->info_files = res_cache_info_files; + r->nr_info_files = ARRAY_SIZE(res_cache_info_files); +} + static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { struct kernfs_node *kn_subdir; + struct rftype *res_info_files; struct rdt_resource *r; - int ret; + int ret, len; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -563,8 +649,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) ret = rdtgroup_kn_set_ugid(kn_subdir); if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn_subdir, res_info_files, - ARRAY_SIZE(res_info_files)); + + res_info_files = r->info_files; + len = r->nr_info_files; + + ret = rdtgroup_add_files(kn_subdir, res_info_files, len); if (ret) goto out_destroy; kernfs_activate(kn_subdir); @@ -780,7 +869,7 @@ out: return dentry; } -static int reset_all_cbms(struct rdt_resource *r) +static int reset_all_ctrls(struct rdt_resource *r) { struct msr_param msr_param; cpumask_var_t cpu_mask; @@ -803,14 +892,14 @@ static int reset_all_cbms(struct rdt_resource *r) cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); for (i = 0; i < r->num_closid; i++) - d->cbm[i] = r->max_cbm; + d->ctrl_val[i] = r->default_ctrl; } cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_cbm_update(&msr_param); + rdt_ctrl_update(&msr_param); /* Update CBM on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); free_cpumask_var(cpu_mask); @@ -896,7 +985,7 @@ static void rdt_kill_sb(struct super_block *sb) /*Put everything back to default values. */ for_each_enabled_rdt_resource(r) - reset_all_cbms(r); + reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); static_branch_disable(&rdt_enable_key); diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c index f369cb8db0d5..406d7a6532f9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -29,26 +29,77 @@ #include <asm/intel_rdt.h> /* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long bw; + int ret; + + /* + * Only linear delay values is supported for current Intel SKUs. 
+ */ + if (!r->membw.delay_linear) + return false; + + ret = kstrtoul(buf, 10, &bw); + if (ret) + return false; + + if (bw < r->membw.min_bw || bw > r->default_ctrl) + return false; + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) +{ + unsigned long data; + + if (d->have_new_ctrl) + return -EINVAL; + + if (!bw_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; + + return 0; +} + +/* * Check whether a cache bit mask is valid. The SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. */ -static bool cbm_validate(unsigned long var, struct rdt_resource *r) +static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) { - unsigned long first_bit, zero_bit; + unsigned long first_bit, zero_bit, val; + unsigned int cbm_len = r->cache.cbm_len; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return false; - if (var == 0 || var > r->max_cbm) + if (val == 0 || val > r->default_ctrl) return false; - first_bit = find_first_bit(&var, r->cbm_len); - zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit); + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len) + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) return false; - if ((zero_bit - first_bit) < r->min_cbm_bits) + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) return false; + + *data = val; return true; } @@ -56,17 +107,17 @@ static bool cbm_validate(unsigned long var, struct rdt_resource *r) * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -static int parse_cbm(char *buf, struct rdt_resource *r) +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - int ret; - ret = kstrtoul(buf, 16, &data); - if (ret) - return ret; - if (!cbm_validate(data, r)) + if (d->have_new_ctrl) return -EINVAL; - r->tmp_cbms[r->num_tmp_cbms++] = data; + + if(!cbm_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; return 0; } @@ -74,8 +125,8 @@ static int parse_cbm(char *buf, struct rdt_resource *r) /* * For each domain in this resource we expect to find a series of: * id=mask - * separated by ";". The "id" is in decimal, and must appear in the - * right order. + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. */ static int parse_line(char *line, struct rdt_resource *r) { @@ -83,21 +134,22 @@ static int parse_line(char *line, struct rdt_resource *r) struct rdt_domain *d; unsigned long dom_id; +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) + return -EINVAL; + dom = strim(dom); list_for_each_entry(d, &r->domains, list) { - dom = strsep(&line, ";"); - if (!dom) - return -EINVAL; - id = strsep(&dom, "="); - if (kstrtoul(id, 10, &dom_id) || dom_id != d->id) - return -EINVAL; - if (parse_cbm(dom, r)) - return -EINVAL; + if (d->id == dom_id) { + if (r->parse_ctrlval(dom, r, d)) + return -EINVAL; + goto next; + } } - - /* Any garbage at the end of the line? 
*/ - if (line && line[0]) - return -EINVAL; - return 0; + return -EINVAL; } static int update_domains(struct rdt_resource *r, int closid) @@ -105,7 +157,7 @@ static int update_domains(struct rdt_resource *r, int closid) struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; - int cpu, idx = 0; + int cpu; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -115,30 +167,46 @@ static int update_domains(struct rdt_resource *r, int closid) msr_param.res = r; list_for_each_entry(d, &r->domains, list) { - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - d->cbm[msr_param.low] = r->tmp_cbms[idx++]; + if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->ctrl_val[closid] = d->new_ctrl; + } } + if (cpumask_empty(cpu_mask)) + goto done; cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_cbm_update(&msr_param); + rdt_ctrl_update(&msr_param); /* Update CBM on other cpus. */ - smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); +done: free_cpumask_var(cpu_mask); return 0; } +static int rdtgroup_parse_resource(char *resname, char *tok, int closid) +{ + struct rdt_resource *r; + + for_each_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && closid < r->num_closid) + return parse_line(tok, r); + } + return -EINVAL; +} + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + struct rdt_domain *dom; struct rdt_resource *r; char *tok, *resname; int closid, ret = 0; - u32 *l3_cbms = NULL; /* Valid input requires a trailing newline */ if (nbytes == 0 || buf[nbytes - 1] != '\n') @@ -153,44 +221,20 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - /* get scratch space to save all the masks while we validate input */ for_each_enabled_rdt_resource(r) { - r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), - GFP_KERNEL); - if (!r->tmp_cbms) { - ret = -ENOMEM; - goto out; - } - r->num_tmp_cbms = 0; + list_for_each_entry(dom, &r->domains, list) + dom->have_new_ctrl = false; } while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strsep(&tok, ":"); + resname = strim(strsep(&tok, ":")); if (!tok) { ret = -EINVAL; goto out; } - for_each_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && - closid < r->num_closid) { - ret = parse_line(tok, r); - if (ret) - goto out; - break; - } - } - if (!r->name) { - ret = -EINVAL; - goto out; - } - } - - /* Did the parser find all the masks we need? 
*/ - for_each_enabled_rdt_resource(r) { - if (r->num_tmp_cbms != r->num_domains) { - ret = -EINVAL; + ret = rdtgroup_parse_resource(resname, tok, closid); + if (ret) goto out; - } } for_each_enabled_rdt_resource(r) { @@ -201,10 +245,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdtgroup_kn_unlock(of->kn); - for_each_enabled_rdt_resource(r) { - kfree(r->tmp_cbms); - r->tmp_cbms = NULL; - } return ret ?: nbytes; } @@ -213,11 +253,12 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) struct rdt_domain *dom; bool sep = false; - seq_printf(s, "%s:", r->name); + seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(dom, &r->domains, list) { if (sep) seq_puts(s, ";"); - seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); + seq_printf(s, r->format_str, dom->id, max_data_width, + dom->ctrl_val[closid]); sep = true; } seq_puts(s, "\n"); diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index a3311c886194..43051f0777d4 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o obj-$(CONFIG_ACPI_APEI) += mce-apei.o + +obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c new file mode 100644 index 000000000000..9c632cb88546 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -0,0 +1,397 @@ +/* + * /dev/mcelog driver + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/poll.h> + +#include "mce-internal.h" + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +#define mce_log_get_idx_check(p) \ +({ \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious mce_log_get_idx_check() usage"); \ + smp_load_acquire(&(p)); \ +}) + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log_buffer mcelog = { + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), +}; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +/* User mode helper program triggered by machine check event */ +extern char mce_helper[128]; + +static int dev_mce_log(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned int next, entry; + + wmb(); + for (;;) { + entry = mce_log_get_idx_check(mcelog.next); + for (;;) { + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); + return NOTIFY_OK; + } + /* Old left over entry. 
Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + return NOTIFY_OK; +} + +static struct notifier_block dev_mcelog_nb = { + .notifier_call = dev_mce_log, + .priority = MCE_PRIO_MCELOG, +}; + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + + +void mce_work_trigger(void) +{ + if (mce_helper[0]) + schedule_work(&mce_trigger_work); +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + cpu_tsc[smp_processor_id()] = rdtsc(); +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. 
+ */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + next = mce_log_get_idx_check(mcelog.next); + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(m, 0, sizeof(*m)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); + smp_rmb(); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); + } + } + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + kfree(cpu_tsc); + + return err ? 
err : buf - ubuf; +} + +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (READ_ONCE(mcelog.next)) + return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static __init int dev_mcelog_init_device(void) +{ + int err; + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) { + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + return err; + } + mce_register_decode_chain(&dev_mcelog_nb); + return 0; +} +device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 1e5a50c11d3c..217cd4449bc9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -85,7 +85,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->mce; - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 903043e6a62b..654ad0668d72 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,7 +13,7 @@ enum severity_level { MCE_PANIC_SEVERITY, }; -extern struct atomic_notifier_head x86_mce_decoder_chain; +extern struct blocking_notifier_head x86_mce_decoder_chain; #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) m1->addr != m2->addr || m1->misc != m2->misc; } + +extern struct device_attribute dev_attr_trigger; + +#ifdef CONFIG_X86_MCELOG_LEGACY +extern void mce_work_trigger(void); +#else +static inline void mce_work_trigger(void) { } +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 
8e9725c607ea..5abd4bf73d6e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -35,6 +35,7 @@ #include <linux/poll.h> #include <linux/nmi.h> #include <linux/cpu.h> +#include <linux/ras.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/mm.h> @@ -49,18 +50,11 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/reboot.h> #include "mce-internal.h" -static DEFINE_MUTEX(mce_chrdev_read_mutex); - -#define mce_log_get_idx_check(p) \ -({ \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ - !lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious mce_log_get_idx_check() usage"); \ - smp_load_acquire(&(p)); \ -}) +static DEFINE_MUTEX(mce_log_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -85,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -/* User mode helper program triggered by machine check event */ -static unsigned long mce_need_notify; -static char mce_helper[128]; -static char *mce_helper_argv[2] = { mce_helper, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); - static DEFINE_PER_CPU(struct mce, mces_seen); -static int cpu_missing; +static unsigned long mce_need_notify; +static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -121,7 +109,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) @@ -143,82 +131,38 @@ void mce_setup(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; - -void mce_log(struct mce *mce) +void mce_log(struct mce *m) { - unsigned next, entry; - - /* Emit the trace record: */ - trace_mce_record(mce); - - if (!mce_gen_pool_add(mce)) + if (!mce_gen_pool_add(m)) irq_work_queue(&mce_irq_work); - - wmb(); - for (;;) { - entry = mce_log_get_idx_check(mcelog.next); - for (;;) { - - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, - (unsigned long *)&mcelog.flags); - return; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, &mce_need_notify); } void mce_inject_log(struct mce *m) { - mutex_lock(&mce_chrdev_read_mutex); + mutex_lock(&mce_log_mutex); mce_log(m); - mutex_unlock(&mce_chrdev_read_mutex); + mutex_unlock(&mce_log_mutex); } EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +/* + * We run the default notifier if we have only the SRAO, the first and the + * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS + * notifiers registered on the chain. 
+ */ +#define NUM_DEFAULT_NOTIFIERS 3 static atomic_t num_notifiers; void mce_register_decode_chain(struct notifier_block *nb) { - atomic_inc(&num_notifiers); + if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + return; - WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); + atomic_inc(&num_notifiers); - atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -226,7 +170,7 @@ void mce_unregister_decode_chain(struct notifier_block *nb) { atomic_dec(&num_notifiers); - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -319,18 +263,7 @@ static void __print_mce(struct mce *m) static void print_mce(struct mce *m) { - int ret = 0; - __print_mce(m); - - /* - * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that) - */ - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - if (ret == NOTIFY_STOP) - return; - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } @@ -519,7 +452,6 @@ static void mce_schedule_work(void) static void mce_irq_work_cb(struct irq_work *entry) { - mce_notify_irq(); mce_schedule_work(); } @@ -548,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs) */ static int mce_usable_address(struct mce *m) { - if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + if (!(m->status & MCI_STATUS_ADDRV)) return 0; /* Checks after this one are Intel-specific: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 1; + if (!(m->status & MCI_STATUS_MISCV)) + return 0; + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; + return 1; } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + return (xec == 0x0 || xec == 0x8); + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + +static bool cec_add_mce(struct mce *m) +{ + if (!m) + return false; + + /* We eat only correctable DRAM errors with usable addresses. 
*/ + if (memory_error(m) && + !(m->status & MCI_STATUS_UC) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return true; + + return false; +} + +static int mce_first_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (cec_add_mce(m)) + return NOTIFY_STOP; + + /* Emit the trace record: */ + trace_mce_record(m); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block first_nb = { + .notifier_call = mce_first_notifier, + .priority = MCE_PRIO_FIRST, +}; + static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -591,11 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - /* - * Run the default notifier if we have only the SRAO - * notifier and us registered. - */ - if (atomic_read(&num_notifiers) > 2) + if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) return NOTIFY_DONE; __print_mce(m); @@ -648,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i) } } -static bool memory_error(struct mce *m) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; - - return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * Intel SDM Volume 3B - 15.9.2 Compound Error Codes - * - * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for - * indicating a memory error. Bit 8 is used for indicating a - * cache hierarchy error. The combination of bit 2 and bit 3 - * is used for indicating a `generic' cache hierarchy error - * But we can't just blindly check the above bits, because if - * bit 11 is set, then it is a bus/interconnect error - and - * either way the above bits just gives more detail on what - * bus/interconnect error happened. Note that bit 12 can be - * ignored, as it's the "filter" bit. - */ - return (m->status & 0xef80) == BIT(7) || - (m->status & 0xef00) == BIT(8) || - (m->status & 0xeffc) == 0xc; - } - - return false; -} - DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -1127,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) * on Intel. */ int lmce = 1; + int cpu = smp_processor_id(); - /* If this CPU is offline, just bail out. */ - if (cpu_is_offline(smp_processor_id())) { + /* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); @@ -1399,13 +1386,6 @@ static void mce_timer_delete_all(void) del_timer_sync(&per_cpu(mce_timer, cpu)); } -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - /* * Notify the user(s) about new machine check events. 
* Can be called from interrupt context, but not from machine check/NMI @@ -1417,11 +1397,7 @@ int mce_notify_irq(void) static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); if (test_and_clear_bit(0, &mce_need_notify)) { - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - - if (mce_helper[0]) - schedule_work(&mce_trigger_work); + mce_work_trigger(); if (__ratelimit(&ratelimit)) pr_info(HW_ERR "Machine check events logged\n"); @@ -1688,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return 0; } -static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +/* + * Init basic CPU features needed for early decoding of MCEs. + */ +static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) { - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; - break; - - case X86_VENDOR_AMD: { + if (c->x86_vendor == X86_VENDOR_AMD) { mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - /* - * Install proper ops for Scalable MCA enabled processors - */ if (mce_flags.smca) { msr_ops.ctl = smca_ctl_reg; msr_ops.status = smca_status_reg; msr_ops.addr = smca_addr_reg; msr_ops.misc = smca_misc_reg; } - mce_amd_feature_init(c); + } +} + +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = cmci_intel_adjust_timer; + break; + case X86_VENDOR_AMD: { + mce_amd_feature_init(c); break; } @@ -1798,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) machine_check_vector = do_machine_check; + __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); @@ -1823,252 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c) } -/* - * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_chrdev_state_lock); -static int mce_chrdev_open_count; /* #times opened */ -static int mce_chrdev_open_exclu; /* already open exclusive? */ - -static int mce_chrdev_open(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - if (mce_chrdev_open_exclu || - (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_chrdev_state_lock); - - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - mce_chrdev_open_exclu = 1; - mce_chrdev_open_count++; - - spin_unlock(&mce_chrdev_state_lock); - - return nonseekable_open(inode, file); -} - -static int mce_chrdev_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - mce_chrdev_open_count--; - mce_chrdev_open_exclu = 0; - - spin_unlock(&mce_chrdev_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - cpu_tsc[smp_processor_id()] = rdtsc(); -} - -static int mce_apei_read_done; - -/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ -static int __mce_read_apei(char __user **ubuf, size_t usize) -{ - int rc; - u64 record_id; - struct mce m; - - if (usize < sizeof(struct mce)) - return -EINVAL; - - rc = apei_read_mce(&m, &record_id); - /* Error or no more MCE record */ - if (rc <= 0) { - mce_apei_read_done = 1; - /* - * When ERST is disabled, mce_chrdev_read() should return - * "no record" instead of "no device." 
- */ - if (rc == -ENODEV) - return 0; - return rc; - } - rc = -EFAULT; - if (copy_to_user(*ubuf, &m, sizeof(struct mce))) - return rc; - /* - * In fact, we should have cleared the record after that has - * been flushed to the disk or sent to network in - * /sbin/mcelog, but we have no interface to support that now, - * so just clear it to avoid duplication. - */ - rc = apei_clear_mce(record_id); - if (rc) { - mce_apei_read_done = 1; - return rc; - } - *ubuf += sizeof(struct mce); - - return 0; -} - -static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, - size_t usize, loff_t *off) -{ - char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; - int i, err; - - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - mutex_lock(&mce_chrdev_read_mutex); - - if (!mce_apei_read_done) { - err = __mce_read_apei(&buf, usize); - if (err || buf != ubuf) - goto out; - } - - next = mce_log_get_idx_check(mcelog.next); - - /* Only supports full reads right now */ - err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) - goto out; - - err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - struct mce *m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(m, 0, sizeof(*m)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, m, sizeof(*m)); - buf += sizeof(*m); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - - if (m->finished && m->tsc < cpu_tsc[m->cpu]) { - err |= copy_to_user(buf, m, sizeof(*m)); - smp_rmb(); - buf += sizeof(*m); - memset(m, 0, sizeof(*m)); - } - } - - if (err) - err = -EFAULT; - -out: - mutex_unlock(&mce_chrdev_read_mutex); - kfree(cpu_tsc); - - return err ? 
err : buf - ubuf; -} - -static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) - return POLLIN | POLLRDNORM; - if (!mce_apei_read_done && apei_check_mce()) - return POLLIN | POLLRDNORM; - return 0; -} - -static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, - unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off); - -void register_mce_write_callback(ssize_t (*fn)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)) -{ - mce_write = fn; -} -EXPORT_SYMBOL_GPL(register_mce_write_callback); - -static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) -{ - if (mce_write) - return mce_write(filp, ubuf, usize, off); - else - return -EINVAL; -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_chrdev_open, - .release = mce_chrdev_release, - .read = mce_chrdev_read, - .write = mce_chrdev_write, - .poll = mce_chrdev_poll, - .unlocked_ioctl = mce_chrdev_ioctl, - .llseek = no_llseek, -}; - -static struct miscdevice mce_chrdev_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - static void __mce_disable_bank(void *arg) { int bank = *((int *)arg); @@ -2142,6 +1878,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&first_nb); mce_register_decode_chain(&mce_srao_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); @@ -2286,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return size; } -static ssize_t -show_trigger(struct device *s, struct device_attribute *attr, char *buf) -{ - strcpy(buf, mce_helper); - strcat(buf, "\n"); - return strlen(mce_helper) + 1; -} - -static ssize_t set_trigger(struct device *s, struct device_attribute *attr, - const char *buf, size_t siz) -{ - char *p; - - strncpy(mce_helper, buf, sizeof(mce_helper)); - mce_helper[sizeof(mce_helper)-1] = 0; - p = strchr(mce_helper, '\n'); - - if (p) - *p = 0; - - return strlen(mce_helper) + !!p; -} - static ssize_t set_ignore_ce(struct device *s, struct device_attribute *attr, const char *buf, size_t size) @@ -2365,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s, return ret; } -static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); @@ -2388,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, +#ifdef CONFIG_X86_MCELOG_LEGACY &dev_attr_trigger, +#endif &dev_attr_monarch_timeout.attr, &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, @@ -2562,7 +2277,6 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { - enum 
cpuhp_state hp_online; int err; if (!mce_available(&boot_cpu_data)) { @@ -2590,21 +2304,11 @@ static __init int mcheck_init_device(void) mce_cpu_online, mce_cpu_pre_down); if (err < 0) goto err_out_online; - hp_online = err; register_syscore_ops(&mce_syscore_ops); - /* register character device /dev/mcelog */ - err = misc_register(&mce_chrdev_device); - if (err) - goto err_register; - return 0; -err_register: - unregister_syscore_ops(&mce_syscore_ops); - cpuhp_remove_state(hp_online); - err_out_online: cpuhp_remove_state(CPUHP_X86_MCE_DEAD); @@ -2612,7 +2316,7 @@ err_out_mem: free_cpumask_var(mce_device_initialized); err_out: - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + pr_err("Unable to init MCE device (rc: %d)\n", err); return err; } @@ -2691,6 +2395,7 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); + cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 524cc5780a77..6e4a047e4b68 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -60,7 +60,7 @@ static const char * const th_names[] = { "load_store", "insn_fetch", "combined_unit", - "", + "decode_unit", "northbridge", "execution_unit", }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 190b3e6cef4d..e84db79ef272 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_XEON_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) return; diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 3b442b64c72d..765afd599039 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -27,7 +27,7 @@ #include <linux/range.h> #include <asm/processor.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -860,7 +860,7 @@ real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); + return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED); } /** @@ -978,7 +978,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) WARN_ON(1); pr_info("update e820 for mtrr\n"); - update_e820(); + e820__update_table_print(); return 1; } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 24e87e74990d..2bce84d91c2b 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -48,7 +48,7 @@ #include <linux/syscore_ops.h> #include <asm/cpufeature.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/pat.h> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 18ca99f2798b..6df621ae62a7 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -31,14 +31,13 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" - "wp\t\t: %s\n", + "wp\t\t: yes\n", static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", static_cpu_has_bug(X86_BUG_F00F) ? 
"yes" : "no", static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - c->cpuid_level, - c->wp_works_ok ? "yes" : "no"); + c->cpuid_level); } #else static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d9794060fe22..23c23508c012 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 3741461c63a0..22217ece26c8 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,6 +29,7 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/e820/types.h> #include <asm/io_apic.h> #include <asm/hpet.h> #include <linux/kdebug.h> @@ -503,16 +504,16 @@ static int prepare_elf_headers(struct kimage *image, void **addr, return ret; } -static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) { unsigned int nr_e820_entries; nr_e820_entries = params->e820_entries; - if (nr_e820_entries >= E820MAX) + if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) return 1; - memcpy(¶ms->e820_map[nr_e820_entries], entry, - sizeof(struct e820entry)); + memcpy(¶ms->e820_table[nr_e820_entries], entry, + sizeof(struct e820_entry)); params->e820_entries++; return 0; } @@ -521,7 +522,7 @@ static int memmap_entry_callback(u64 start, u64 end, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; - struct e820entry ei; + struct e820_entry ei; ei.addr = start; ei.size = end - start + 1; @@ -560,7 +561,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) { int i, ret = 0; unsigned long flags; - struct e820entry ei; + struct e820_entry ei; struct crash_memmap_data cmd; struct crash_mem *cmem; @@ -574,17 +575,17 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add first 640K segment */ ei.addr = image->arch.backup_src_start; ei.size = image->arch.backup_src_sz; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); /* Add ACPI tables */ - cmd.type = E820_ACPI; + cmd.type = E820_TYPE_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ - cmd.type = E820_NVS; + cmd.type = E820_TYPE_NVS; walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); @@ -592,7 +593,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) if (crashk_low_res.end) { ei.addr = crashk_low_res.start; ei.size = crashk_low_res.end - crashk_low_res.start + 1; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } @@ -609,7 +610,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) if (ei.size < PAGE_SIZE) continue; ei.addr = 
cmem->ranges[i].start; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b2bbad6ebe4d..6e9b26fa6d05 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1,49 +1,55 @@ /* - * Handle the memory map. - * The functions here do the job until bootmem takes over. + * Low level x86 E820 memory map handling functions. * - * Getting sanitize_e820_map() in sync with i386 version by applying change: - * - Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * The firmware and bootloader pass us the "E820 table", which is the primary + * physical memory layout description available about x86 systems. * + * The kernel takes the E820 memory layout and optionally modifies it with + * quirks and other tweaks, and feeds that into the generic Linux memory + * allocation code routines via a platform independent interface (memblock, etc.). */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> #include <linux/crash_dump.h> -#include <linux/export.h> #include <linux/bootmem.h> -#include <linux/pfn.h> #include <linux/suspend.h> #include <linux/acpi.h> #include <linux/firmware-map.h> #include <linux/memblock.h> #include <linux/sort.h> -#include <asm/e820.h> -#include <asm/proto.h> +#include <asm/e820/api.h> #include <asm/setup.h> -#include <asm/cpufeature.h> /* - * The e820 map is the map that gets modified e.g. with command line parameters - * and that is also registered with modifications in the kernel resource tree - * with the iomem_resource as parent. * - * The e820_saved is directly saved after the BIOS-provided memory map is - * copied. It doesn't get modified afterwards. It's registered for the - * /sys/firmware/memmap interface. + * We organize the E820 table into two main data structures: * - * That memory map is not modified and is used as base for kexec. The kexec'd - * kernel should get the same memory map as the firmware provides. Then the - * user can e.g. boot the original kernel with mem=1G while still booting the - * next kernel with full memory. + * - 'e820_table_firmware': the original firmware version passed to us by the + * bootloader - not modified by the kernel. We use this to: * - * - inform the user about the firmware's notion of memory layout + * via /sys/firmware/memmap + * + * - the hibernation code uses it to generate a kernel-independent MD5 + * fingerprint of the physical memory layout of a system. + * + * - kexec, which is a bootloader in disguise, uses the original E820 + * layout to pass to the kexec-ed kernel. This way the original kernel + * can have a restricted E820 map while the kexec()-ed kexec-kernel + * can have access to full memory - etc. + * + * - 'e820_table': this is the main E820 table that is massaged by the + * low level x86 platform code, or modified by boot parameters, before + * passed on to higher level MM layers. + * + * Once the E820 map has been converted to the standard Linux memory layout + * information its role stops - modifying it has no effect and does not get + * re-propagated. So its main role is a temporary bootstrap storage of firmware + * specific memory layout data during early bootup. 
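+ *
+ * A concrete illustration of the split (example values, not from this
+ * patch): booting with 'mem=1G' truncates 'e820_table', so the kernel
+ * itself only uses the first 1 GB of RAM, while 'e820_table_firmware'
+ * still describes the machine's full memory - which is what
+ * /sys/firmware/memmap exports and what a kexec-ed kernel is handed.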
*/ -static struct e820map initial_e820 __initdata; -static struct e820map initial_e820_saved __initdata; -struct e820map *e820 __refdata = &initial_e820; -struct e820map *e820_saved __refdata = &initial_e820_saved; +static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_firmware_init __initdata; + +struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -55,51 +61,53 @@ EXPORT_SYMBOL(pci_mem_start); * This function checks if any part of the range <start,end> is mapped * with type. */ -int -e820_any_mapped(u64 start, u64 end, unsigned type) +bool e820__mapped_any(u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (type && ei->type != type) + if (type && entry->type != type) continue; - if (ei->addr >= end || ei->addr + ei->size <= start) + if (entry->addr >= end || entry->addr + entry->size <= start) continue; return 1; } return 0; } -EXPORT_SYMBOL_GPL(e820_any_mapped); +EXPORT_SYMBOL_GPL(e820__mapped_any); /* - * This function checks if the entire range <start,end> is mapped with type. + * This function checks if the entire <start,end> range is mapped with 'type'. * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case + * Note: this function only works correctly once the E820 table is sorted and + * not-overlapping (at least for the range specified), which is the case normally. */ -int __init e820_all_mapped(u64 start, u64 end, unsigned type) +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (type && ei->type != type) + if (type && entry->type != type) continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) + + /* Is the region (part) in overlap with the current region? */ + if (entry->addr >= end || entry->addr + entry->size <= start) continue; - /* if the region is at the beginning of <start,end> we move - * start to the end of the region since it's ok until there + /* + * If the region is at the beginning of <start,end> we move + * 'start' to the end of the region since it's ok until there */ - if (ei->addr <= start) - start = ei->addr + ei->size; + if (entry->addr <= start) + start = entry->addr + entry->size; + /* - * if start is now at or beyond end, we're done, full - * coverage + * If 'start' is now at or beyond 'end', we're done, full + * coverage of the desired range exists: */ if (start >= end) return 1; @@ -108,94 +116,77 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) } /* - * Add a memory region to the kernel e820 map. + * Add a memory region to the kernel E820 map. 
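+ *
+ * An illustrative call (hypothetical address and size, not from this
+ * patch), following the same pattern the converted callers use - an
+ * e820__range_add() followed by e820__update_table_print() to re-sort
+ * and log the modified table:
+ *
+ *	e820__range_add(0xf0000, 0x1000, E820_TYPE_RESERVED);
+ *	e820__update_table_print();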
*/ -static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, - int type) +static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { - int x = e820x->nr_map; + int x = table->nr_entries; - if (x >= ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", - (unsigned long long) start, - (unsigned long long) (start + size - 1)); + if (x >= ARRAY_SIZE(table->entries)) { + pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1); return; } - e820x->map[x].addr = start; - e820x->map[x].size = size; - e820x->map[x].type = type; - e820x->nr_map++; + table->entries[x].addr = start; + table->entries[x].size = size; + table->entries[x].type = type; + table->nr_entries++; } -void __init e820_add_region(u64 start, u64 size, int type) +void __init e820__range_add(u64 start, u64 size, enum e820_type type) { - __e820_add_region(e820, start, size, type); + __e820__range_add(e820_table, start, size, type); } -static void __init e820_print_type(u32 type) +static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_RAM: - case E820_RESERVED_KERN: - printk(KERN_CONT "usable"); - break; - case E820_RESERVED: - printk(KERN_CONT "reserved"); - break; - case E820_ACPI: - printk(KERN_CONT "ACPI data"); - break; - case E820_NVS: - printk(KERN_CONT "ACPI NVS"); - break; - case E820_UNUSABLE: - printk(KERN_CONT "unusable"); - break; - case E820_PMEM: - case E820_PRAM: - printk(KERN_CONT "persistent (type %u)", type); - break; - default: - printk(KERN_CONT "type %u", type); - break; + case E820_TYPE_RAM: /* Fall through: */ + case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_ACPI: pr_cont("ACPI data"); break; + case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; + case E820_TYPE_PMEM: /* Fall through: */ + case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break; + default: pr_cont("type %u", type); break; } } -void __init e820_print_map(char *who) +void __init e820__print_table(char *who) { int i; - for (i = 0; i < e820->nr_map; i++) { - printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820->map[i].addr, - (unsigned long long) - (e820->map[i].addr + e820->map[i].size - 1)); - e820_print_type(e820->map[i].type); - printk(KERN_CONT "\n"); + for (i = 0; i < e820_table->nr_entries; i++) { + pr_info("%s: [mem %#018Lx-%#018Lx] ", who, + e820_table->entries[i].addr, + e820_table->entries[i].addr + e820_table->entries[i].size - 1); + + e820_print_type(e820_table->entries[i].type); + pr_cont("\n"); } } /* - * Sanitize the BIOS e820 map. + * Sanitize an E820 map. * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps, + * Some E820 layouts include overlapping entries. The following + * replaces the original E820 map with a new one, removing overlaps, * and resolving conflicting memory types in favor of highest * numbered type. * - * The input parameter biosmap points to an array of 'struct - * e820entry' which on entry has elements in the range [0, *pnr_map) - * valid, and which has space for up to max_nr_map entries. - * On return, the resulting sanitized e820 map entries will be in - * overwritten in the same location, starting at biosmap. 
+ * The input parameter 'entries' points to an array of 'struct + * e820_entry' which on entry has elements in the range [0, *nr_entries) + * valid, and which has space for up to max_nr_entries entries. + * On return, the resulting sanitized E820 map entries will be + * overwritten in the same location, starting at 'entries'. * - * The integer pointed to by pnr_map must be valid on entry (the - * current number of valid entries located at biosmap). If the - * sanitizing succeeds the *pnr_map will be updated with the new - * number of valid entries (something no more than max_nr_map). + * The table->nr_entries field must be valid on entry (the + * current number of valid entries located at 'entries'). If the + * sanitizing succeeds, table->nr_entries will be updated with the new + * number of valid entries (something no more than max_nr_entries). * - * The return value from sanitize_e820_map() is zero if it + * The return value from e820__update_table() is zero if it * successfully 'sanitized' the map entries passed in, and is -1 * if it did nothing, which can happen if either of (1) it was * only passed one map entry, or (2) any of the input map entries @@ -238,10 +229,17 @@ void __init e820_print_map(char *who) * ______________________4_ */ struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ + /* Pointer to the original entry: */ + struct e820_entry *entry; + /* Address for this change point: */ + unsigned long long addr; }; +static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; +static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata; +static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata; +static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata; + static int __init cpcompare(const void *a, const void *b) { struct change_member * const *app = a, * const *bpp = b; @@ -249,164 +247,141 @@ static int __init cpcompare(const void *a, const void *b) /* * Inputs are pointers to two elements of change_point[]. If their - * addresses are unequal, their difference dominates. If the addresses + * addresses are not equal, their difference dominates. If the addresses * are equal, then consider one that represents the end of its region * to be greater than one that does not. */ if (ap->addr != bp->addr) return ap->addr > bp->addr ? 
1 : -1; - return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); + return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } -int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, - u32 *pnr_map) +int __init e820__update_table(struct e820_table *table) { - static struct change_member change_point_list[2*E820_X_MAX] __initdata; - static struct change_member *change_point[2*E820_X_MAX] __initdata; - static struct e820entry *overlap_list[E820_X_MAX] __initdata; - static struct e820entry new_bios[E820_X_MAX] __initdata; - unsigned long current_type, last_type; + struct e820_entry *entries = table->entries; + u32 max_nr_entries = ARRAY_SIZE(table->entries); + enum e820_type current_type, last_type; unsigned long long last_addr; - int chgidx; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; + u32 new_nr_entries, overlap_entries; + u32 i, chg_idx, chg_nr; - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) + /* If there's only one memory region, don't bother: */ + if (table->nr_entries < 2) return -1; - old_nr = *pnr_map; - BUG_ON(old_nr > max_nr_map); + BUG_ON(table->nr_entries > max_nr_entries); - /* bail out if we find any unreasonable addresses in bios map */ - for (i = 0; i < old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + /* Bail out if we find any unreasonable addresses in the map: */ + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].addr + entries[i].size < entries[i].addr) return -1; + } - /* create pointers for initial change-point information (for sorting) */ - for (i = 0; i < 2 * old_nr; i++) + /* Create pointers for initial change-point information (for sorting): */ + for (i = 0; i < 2 * table->nr_entries; i++) change_point[i] = &change_point_list[i]; - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i = 0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + - biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; + /* + * Record all known change-points (starting and ending addresses), + * omitting empty memory regions: + */ + chg_idx = 0; + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].size != 0) { + change_point[chg_idx]->addr = entries[i].addr; + change_point[chg_idx++]->entry = &entries[i]; + change_point[chg_idx]->addr = entries[i].addr + entries[i].size; + change_point[chg_idx++]->entry = &entries[i]; } } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); - - /* create a new bios memory map, removing overlaps */ - overlap_entries = 0; /* number of entries in the overlap table */ - new_bios_entry = 0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - - /* loop through change-points, determining affect on the new bios map */ - for (chgidx = 0; chgidx < chg_nr; chgidx++) { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == - change_point[chgidx]->pbios->addr) { - /* - * add map entry to overlap list (> 1 entry - * implies an overlap) - */ - overlap_list[overlap_entries++] = 
change_point[chgidx]->pbios; + chg_nr = chg_idx; + + /* Sort change-point list by memory addresses (low -> high): */ + sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL); + + /* Create a new memory map, removing overlaps: */ + overlap_entries = 0; /* Number of entries in the overlap table */ + new_nr_entries = 0; /* Index for creating new map entries */ + last_type = 0; /* Start with undefined memory type */ + last_addr = 0; /* Start with 0 as last starting address */ + + /* Loop through change-points, determining effect on the new map: */ + for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) { + /* Keep track of all overlapping entries */ + if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) { + /* Add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++] = change_point[chg_idx]->entry; } else { - /* - * remove entry from list (order independent, - * so swap with last) - */ + /* Remove entry from list (order independent, so swap with last): */ for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == - change_point[chgidx]->pbios) - overlap_list[i] = - overlap_list[overlap_entries-1]; + if (overlap_list[i] == change_point[chg_idx]->entry) + overlap_list[i] = overlap_list[overlap_entries-1]; } overlap_entries--; } /* - * if there are overlapping entries, decide which + * If there are overlapping entries, decide which * "type" to use (larger value takes precedence -- * 1=usable, 2,3,4,4+=unusable) */ current_type = 0; - for (i = 0; i < overlap_entries; i++) + for (i = 0; i < overlap_entries; i++) { if (overlap_list[i]->type > current_type) current_type = overlap_list[i]->type; - /* - * continue building up new bios map based on this - * information - */ - if (current_type != last_type || current_type == E820_PRAM) { + } + + /* Continue building up new map based on this information: */ + if (current_type != last_type || current_type == E820_TYPE_PRAM) { if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* - * move forward only if the new size - * was non-zero - */ - if (new_bios[new_bios_entry].size != 0) - /* - * no more space left for new - * bios entries ? - */ - if (++new_bios_entry >= max_nr_map) + new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; + /* Move forward only if the new size was non-zero: */ + if (new_entries[new_nr_entries].size != 0) + /* No more space left for new entries? 
*/ + if (++new_nr_entries >= max_nr_entries) break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = - change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr = change_point[chgidx]->addr; + new_entries[new_nr_entries].addr = change_point[chg_idx]->addr; + new_entries[new_nr_entries].type = current_type; + last_addr = change_point[chg_idx]->addr; } last_type = current_type; } } - /* retain count for new bios entries */ - new_nr = new_bios_entry; - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); - *pnr_map = new_nr; + /* Copy the new entries into the original location: */ + memcpy(entries, new_entries, new_nr_entries*sizeof(*entries)); + table->nr_entries = new_nr_entries; return 0; } -static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) +static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { - while (nr_map) { - u64 start = biosmap->addr; - u64 size = biosmap->size; + struct boot_e820_entry *entry = entries; + + while (nr_entries) { + u64 start = entry->addr; + u64 size = entry->size; u64 end = start + size - 1; - u32 type = biosmap->type; + u32 type = entry->type; - /* Overflow in 64 bits? Ignore the memory map. */ + /* Ignore the entry on 64-bit overflow: */ if (start > end && likely(size)) return -1; - e820_add_region(start, size, type); + e820__range_add(start, size, type); - biosmap++; - nr_map--; + entry++; + nr_entries--; } return 0; } /* - * Copy the BIOS e820 map into a safe place. + * Copy the BIOS E820 map into a safe place. * * Sanity-check it while we're at it.. * @@ -414,18 +389,17 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init append_e820_map(struct e820entry *biosmap, int nr_map) +static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) + if (nr_entries < 2) return -1; - return __append_e820_map(biosmap, nr_map); + return __append_e820_table(entries, nr_entries); } -static u64 __init __e820_update_range(struct e820map *e820x, u64 start, - u64 size, unsigned old_type, - unsigned new_type) +static u64 __init +__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { u64 end; unsigned int i; @@ -437,77 +411,73 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", - (unsigned long long) start, (unsigned long long) (end - 1)); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1); e820_print_type(old_type); - printk(KERN_CONT " ==> "); + pr_cont(" ==> "); e820_print_type(new_type); - printk(KERN_CONT "\n"); + pr_cont("\n"); - for (i = 0; i < e820x->nr_map; i++) { - struct e820entry *ei = &e820x->map[i]; + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; u64 final_start, final_end; - u64 ei_end; + u64 entry_end; - if (ei->type != old_type) + if (entry->type != old_type) continue; - ei_end = ei->addr + ei->size; - /* totally covered by new range? 
*/ - if (ei->addr >= start && ei_end <= end) { - ei->type = new_type; - real_updated_size += ei->size; + entry_end = entry->addr + entry->size; + + /* Completely covered by new range? */ + if (entry->addr >= start && entry_end <= end) { + entry->type = new_type; + real_updated_size += entry->size; continue; } - /* new range is totally covered? */ - if (ei->addr < start && ei_end > end) { - __e820_add_region(e820x, start, size, new_type); - __e820_add_region(e820x, end, ei_end - end, ei->type); - ei->size = start - ei->addr; + /* New range is completely covered? */ + if (entry->addr < start && entry_end > end) { + __e820__range_add(table, start, size, new_type); + __e820__range_add(table, end, entry_end - end, entry->type); + entry->size = start - entry->addr; real_updated_size += size; continue; } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(end, ei_end); + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); if (final_start >= final_end) continue; - __e820_add_region(e820x, final_start, final_end - final_start, - new_type); + __e820__range_add(table, final_start, final_end - final_start, new_type); real_updated_size += final_end - final_start; /* - * left range could be head or tail, so need to update - * size at first. + * Left range could be head or tail, so need to update + * its size first: */ - ei->size -= final_end - final_start; - if (ei->addr < final_start) + entry->size -= final_end - final_start; + if (entry->addr < final_start) continue; - ei->addr = final_end; + + entry->addr = final_end; } return real_updated_size; } -u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) +u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820_update_range(e820, start, size, old_type, new_type); + return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820_update_range_saved(u64 start, u64 size, - unsigned old_type, unsigned new_type) +static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820_update_range(e820_saved, start, size, old_type, - new_type); + return __e820__range_update(e820_table_firmware, start, size, old_type, new_type); } -/* make e820 not cover the range */ -u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, - int checktype) +/* Remove a range of memory from the E820 table: */ +u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) { int i; u64 end; @@ -517,85 +487,89 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", - (unsigned long long) start, (unsigned long long) (end - 1)); - if (checktype) + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1); + if (check_type) e820_print_type(old_type); - printk(KERN_CONT "\n"); + pr_cont("\n"); - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; u64 final_start, final_end; - u64 ei_end; + u64 entry_end; - if (checktype && ei->type != old_type) + if (check_type && entry->type != old_type) continue; - ei_end = ei->addr + ei->size; - /* totally covered? 
*/ - if (ei->addr >= start && ei_end <= end) { - real_removed_size += ei->size; - memset(ei, 0, sizeof(struct e820entry)); + entry_end = entry->addr + entry->size; + + /* Completely covered? */ + if (entry->addr >= start && entry_end <= end) { + real_removed_size += entry->size; + memset(entry, 0, sizeof(*entry)); continue; } - /* new range is totally covered? */ - if (ei->addr < start && ei_end > end) { - e820_add_region(end, ei_end - end, ei->type); - ei->size = start - ei->addr; + /* Is the new range completely covered? */ + if (entry->addr < start && entry_end > end) { + e820__range_add(end, entry_end - end, entry->type); + entry->size = start - entry->addr; real_removed_size += size; continue; } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(end, ei_end); + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); if (final_start >= final_end) continue; + real_removed_size += final_end - final_start; /* - * left range could be head or tail, so need to update - * size at first. + * Left range could be head or tail, so need to update + * the size first: */ - ei->size -= final_end - final_start; - if (ei->addr < final_start) + entry->size -= final_end - final_start; + if (entry->addr < final_start) continue; - ei->addr = final_end; + + entry->addr = final_end; } return real_removed_size; } -void __init update_e820(void) +void __init e820__update_table_print(void) { - if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map)) + if (e820__update_table(e820_table)) return; - printk(KERN_INFO "e820: modified physical RAM map:\n"); - e820_print_map("modified"); + + pr_info("e820: modified physical RAM map:\n"); + e820__print_table("modified"); } -static void __init update_e820_saved(void) + +static void __init e820__update_table_firmware(void) { - sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map), - &e820_saved->nr_map); + e820__update_table(e820_table_firmware); } + #define MAX_GAP_END 0x100000000ull + /* - * Search for a gap in the e820 memory space from 0 to MAX_GAP_END. + * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). */ -static int __init e820_search_gap(unsigned long *gapstart, - unsigned long *gapsize) +static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) { unsigned long long last = MAX_GAP_END; - int i = e820->nr_map; + int i = e820_table->nr_entries; int found = 0; while (--i >= 0) { - unsigned long long start = e820->map[i].addr; - unsigned long long end = start + e820->map[i].size; + unsigned long long start = e820_table->entries[i].addr; + unsigned long long end = start + e820_table->entries[i].size; /* * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true + * fit in 32 bits if this condition is true: */ if (last > end) { unsigned long gap = last - end; @@ -613,12 +587,14 @@ static int __init e820_search_gap(unsigned long *gapstart, } /* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. + * Search for the biggest gap in the low 32 bits of the E820 + * memory space. We pass this space to the PCI subsystem, so + * that it can assign MMIO resources for hotplug or + * unconfigured devices in. + * * Hopefully the BIOS let enough space left. 
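 *
 * Illustrative outcome (hypothetical addresses, not from this patch): on
 * a machine whose RAM and ACPI ranges all end below 0xc0000000, the
 * search below would find the gap above them and log something like:
 *
 *	e820: [mem 0xc0000000-0xfebfffff] available for PCI devices
 *
 * via the pr_info() at the end of this function.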
*/ -__init void e820_setup_gap(void) +__init void e820__setup_pci_gap(void) { unsigned long gapstart, gapsize; int found; @@ -629,138 +605,143 @@ __init void e820_setup_gap(void) if (!found) { #ifdef CONFIG_X86_64 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR - "e820: cannot find a gap in the 32bit address range\n" - "e820: PCI devices with unassigned 32bit BARs may break!\n"); + pr_err( + "e820: Cannot find an available gap in the 32-bit address range\n" + "e820: PCI devices with unassigned 32-bit BARs may not work!\n"); #else gapstart = 0x10000000; #endif } /* - * e820_reserve_resources_late protect stolen RAM already + * e820__reserve_resources_late() protects stolen RAM already: */ pci_mem_start = gapstart; - printk(KERN_INFO - "e820: [mem %#010lx-%#010lx] available for PCI devices\n", - gapstart, gapstart + gapsize - 1); + pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1); } /* * Called late during init, in free_initmem(). * - * Initial e820 and e820_saved are largish __initdata arrays. - * Copy them to (usually much smaller) dynamically allocated area. - * This is done after all tweaks we ever do to them: - * all functions which modify them are __init functions, - * they won't exist after this point. + * Initial e820_table and e820_table_firmware are largish __initdata arrays. + * + * Copy them to a (usually much smaller) dynamically allocated area that is + * sized precisely after the number of e820 entries. + * + * This is done after we've performed all the fixes and tweaks to the tables. + * All functions which modify them are __init functions, which won't exist + * after free_initmem(). */ -__init void e820_reallocate_tables(void) +__init void e820__reallocate_tables(void) { - struct e820map *n; + struct e820_table *n; int size; - size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820, size); - e820 = n; + memcpy(n, e820_table, size); + e820_table = n; - size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820_saved, size); - e820_saved = n; + memcpy(n, e820_table_firmware, size); + e820_table_firmware = n; } -/** - * Because of the size limitation of struct boot_params, only first - * 128 E820 memory entries are passed to kernel via - * boot_params.e820_map, others are passed via SETUP_E820_EXT node of - * linked list of struct setup_data, which is parsed here. +/* + * Because of the small fixed size of struct boot_params, only the first + * 128 E820 memory entries are passed to the kernel via boot_params.e820_table, + * the remaining (if any) entries are passed via the SETUP_E820_EXT node of + * struct setup_data, which is parsed here. 
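+ *
+ * For reference (pre-existing boot protocol layout, not introduced by
+ * this patch), each node in that list has the form:
+ *
+ *	struct setup_data {
+ *		__u64 next;	- physical address of the next node (0 == last)
+ *		__u32 type;	- SETUP_E820_EXT for extra E820 entries
+ *		__u32 len;	- size of data[] in bytes
+ *		__u8  data[];	- len/sizeof(struct boot_e820_entry) entries
+ *	};
+ *
+ * which is what the 'sdata->len / sizeof(*extmap)' calculation below
+ * relies on.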
*/ -void __init parse_e820_ext(u64 phys_addr, u32 data_len) +void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) { int entries; - struct e820entry *extmap; + struct boot_e820_entry *extmap; struct setup_data *sdata; sdata = early_memremap(phys_addr, data_len); - entries = sdata->len / sizeof(struct e820entry); - extmap = (struct e820entry *)(sdata->data); - __append_e820_map(extmap, entries); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + entries = sdata->len / sizeof(*extmap); + extmap = (struct boot_e820_entry *)(sdata->data); + + __append_e820_table(extmap, entries); + e820__update_table(e820_table); + early_memunmap(sdata, data_len); - printk(KERN_INFO "e820: extended physical RAM map:\n"); - e820_print_map("extended"); + pr_info("e820: extended physical RAM map:\n"); + e820__print_table("extended"); } -#if defined(CONFIG_X86_64) || \ - (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) -/** +/* * Find the ranges of physical addresses that do not correspond to - * e820 RAM areas and mark the corresponding pages as nosave for - * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * E820 RAM areas and register the corresponding pages as 'nosave' for + * hibernation (32-bit) or software suspend and suspend to RAM (64-bit). * - * This function requires the e820 map to be sorted and without any + * This function requires the E820 map to be sorted and without any * overlapping entries. */ -void __init e820_mark_nosave_regions(unsigned long limit_pfn) +void __init e820__register_nosave_regions(unsigned long limit_pfn) { int i; unsigned long pfn = 0; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (pfn < PFN_UP(ei->addr)) - register_nosave_region(pfn, PFN_UP(ei->addr)); + if (pfn < PFN_UP(entry->addr)) + register_nosave_region(pfn, PFN_UP(entry->addr)); - pfn = PFN_DOWN(ei->addr + ei->size); + pfn = PFN_DOWN(entry->addr + entry->size); - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) - register_nosave_region(PFN_UP(ei->addr), pfn); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + register_nosave_region(PFN_UP(entry->addr), pfn); if (pfn >= limit_pfn) break; } } -#endif #ifdef CONFIG_ACPI -/** - * Mark ACPI NVS memory region, so that we can save/restore it during - * hibernation and the subsequent resume. +/* + * Register ACPI NVS memory regions, so that we can save/restore them during + * hibernation and the subsequent resume: */ -static int __init e820_mark_nvs_memory(void) +static int __init e820__register_nvs_regions(void) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (ei->type == E820_NVS) - acpi_nvs_register(ei->addr, ei->size); + if (entry->type == E820_TYPE_NVS) + acpi_nvs_register(entry->addr, entry->size); } return 0; } -core_initcall(e820_mark_nvs_memory); +core_initcall(e820__register_nvs_regions); #endif /* - * pre allocated 4k and reserved it in memblock and e820_saved + * Allocate the requested number of bytes with the requested alignment + * and return (the physical address) to the caller. Also register this + * range in the 'firmware' E820 table as a reserved range. + * + * This allows kexec to fake a new mptable, as if it came from the real + * system. 
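+ *
+ * A hypothetical caller (illustrative only) reserving one page would do:
+ *
+ *	u64 pa = e820__memblock_alloc_reserved(PAGE_SIZE, PAGE_SIZE);
+ *
+ * and treat pa == 0 as allocation failure. On success the range is also
+ * flipped from E820_TYPE_RAM to E820_TYPE_RESERVED in e820_table_firmware,
+ * so it stays reserved in the map handed to a kexec-ed kernel.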
*/ -u64 __init early_reserve_e820(u64 size, u64 align) +u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) { u64 addr; addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { - e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); - printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); - update_e820_saved(); + e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n"); + e820__update_table_firmware(); } return addr; @@ -779,22 +760,22 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type) { int i; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; unsigned long start_pfn; unsigned long end_pfn; - if (ei->type != type) + if (entry->type != type) continue; - start_pfn = ei->addr >> PAGE_SHIFT; - end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + start_pfn = entry->addr >> PAGE_SHIFT; + end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT; if (start_pfn >= limit_pfn) continue; @@ -809,18 +790,19 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", + pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } -unsigned long __init e820_end_of_ram_pfn(void) + +unsigned long __init e820__end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); + return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM); } -unsigned long __init e820_end_of_low_ram_pfn(void) +unsigned long __init e820__end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM); } static void __init early_panic(char *msg) @@ -831,7 +813,7 @@ static void __init early_panic(char *msg) static int userdef __initdata; -/* "mem=nopentium" disables the 4MB page tables. */ +/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ static int __init parse_memopt(char *p) { u64 mem_size; @@ -844,17 +826,19 @@ static int __init parse_memopt(char *p) setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; #else - printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + pr_warn("mem=nopentium ignored! (only supported on x86_32)\n"); return -EINVAL; #endif } userdef = 1; mem_size = memparse(p, &p); - /* don't remove all of memory when handling "mem={invalid}" param */ + + /* Don't remove all memory when getting "mem={invalid}" parameter: */ if (mem_size == 0) return -EINVAL; - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); return 0; } @@ -872,12 +856,12 @@ static int __init parse_memmap_one(char *p) #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know - * the real mem size before original memory map is + * the real memory size before the original memory map is * reset. 
*/ - saved_max_pfn = e820_end_of_ram_pfn(); + saved_max_pfn = e820__end_of_ram_pfn(); #endif - e820->nr_map = 0; + e820_table->nr_entries = 0; userdef = 1; return 0; } @@ -890,21 +874,23 @@ static int __init parse_memmap_one(char *p) userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_RAM); + e820__range_add(start_at, mem_size, E820_TYPE_RAM); } else if (*p == '#') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_ACPI); + e820__range_add(start_at, mem_size, E820_TYPE_ACPI); } else if (*p == '$') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_RESERVED); + e820__range_add(start_at, mem_size, E820_TYPE_RESERVED); } else if (*p == '!') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_PRAM); - } else - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else { + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + } return *p == '\0' ? 0 : -EINVAL; } + static int __init parse_memmap_opt(char *str) { while (str) { @@ -921,68 +907,97 @@ static int __init parse_memmap_opt(char *str) } early_param("memmap", parse_memmap_opt); -void __init finish_e820_parsing(void) +/* + * Reserve all entries from the bootloader's extensible data nodes list, + * because if present we are going to use it later on to fetch e820 + * entries from it: + */ +void __init e820__reserve_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return; + + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + pa_data = data->next; + early_memunmap(data, sizeof(*data)); + } + + e820__update_table(e820_table); + + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("extended physical RAM map:\n"); + e820__print_table("reserve setup_data"); +} + +/* + * Called after parse_early_param(), after early parameters (such as mem=) + * have been processed, in which case we already have an E820 table filled in + * via the parameter callback function(s), but it's not sorted and printed yet: + */ +void __init e820__finish_early_params(void) { if (userdef) { - if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), - &e820->nr_map) < 0) + if (e820__update_table(e820_table) < 0) early_panic("Invalid user supplied memory map"); - printk(KERN_INFO "e820: user-defined physical RAM map:\n"); - e820_print_map("user"); + pr_info("e820: user-defined physical RAM map:\n"); + e820__print_table("user"); } } -static const char *__init e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(struct e820_entry *entry) { - switch (e820_type) { - case E820_RESERVED_KERN: - case E820_RAM: return "System RAM"; - case E820_ACPI: return "ACPI Tables"; - case E820_NVS: return "ACPI Non-volatile Storage"; - case E820_UNUSABLE: return "Unusable memory"; - case E820_PRAM: return "Persistent Memory (legacy)"; - case E820_PMEM: return "Persistent Memory"; - default: return "reserved"; + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return "System RAM"; + case E820_TYPE_ACPI: return "ACPI Tables"; + case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; + case E820_TYPE_UNUSABLE: return "Unusable memory"; + case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; + case 
E820_TYPE_PMEM: return "Persistent Memory"; + case E820_TYPE_RESERVED: return "Reserved"; + default: return "Unknown E820 type"; } } -static unsigned long __init e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) { - switch (e820_type) { - case E820_RESERVED_KERN: - case E820_RAM: - return IORESOURCE_SYSTEM_RAM; - case E820_ACPI: - case E820_NVS: - case E820_UNUSABLE: - case E820_PRAM: - case E820_PMEM: - default: - return IORESOURCE_MEM; + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; + case E820_TYPE_ACPI: /* Fall-through: */ + case E820_TYPE_NVS: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_PRAM: /* Fall-through: */ + case E820_TYPE_PMEM: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + default: return IORESOURCE_MEM; } } -static unsigned long __init e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) { - switch (e820_type) { - case E820_ACPI: - return IORES_DESC_ACPI_TABLES; - case E820_NVS: - return IORES_DESC_ACPI_NV_STORAGE; - case E820_PMEM: - return IORES_DESC_PERSISTENT_MEMORY; - case E820_PRAM: - return IORES_DESC_PERSISTENT_MEMORY_LEGACY; - case E820_RESERVED_KERN: - case E820_RAM: - case E820_UNUSABLE: - default: - return IORES_DESC_NONE; + switch (entry->type) { + case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES; + case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; + case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; + case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + default: return IORES_DESC_NONE; } } -static bool __init do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(enum e820_type type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -993,61 +1008,71 @@ static bool __init do_mark_busy(u32 type, struct resource *res) * for exclusive use of a driver */ switch (type) { - case E820_RESERVED: - case E820_PRAM: - case E820_PMEM: + case E820_TYPE_RESERVED: + case E820_TYPE_PRAM: + case E820_TYPE_PMEM: return false; + case E820_TYPE_RESERVED_KERN: + case E820_TYPE_RAM: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: default: return true; } } /* - * Mark e820 reserved areas as busy for the resource manager. 
+ * Mark E820 reserved areas as busy for the resource manager: */ + static struct resource __initdata *e820_res; -void __init e820_reserve_resources(void) + +void __init e820__reserve_resources(void) { int i; struct resource *res; u64 end; - res = alloc_bootmem(sizeof(struct resource) * e820->nr_map); + res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries); e820_res = res; - for (i = 0; i < e820->nr_map; i++) { - end = e820->map[i].addr + e820->map[i].size - 1; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = e820_table->entries + i; + + end = entry->addr + entry->size - 1; if (end != (resource_size_t)end) { res++; continue; } - res->name = e820_type_to_string(e820->map[i].type); - res->start = e820->map[i].addr; - res->end = end; - - res->flags = e820_type_to_iomem_type(e820->map[i].type); - res->desc = e820_type_to_iores_desc(e820->map[i].type); + res->start = entry->addr; + res->end = end; + res->name = e820_type_to_string(entry); + res->flags = e820_type_to_iomem_type(entry); + res->desc = e820_type_to_iores_desc(entry); /* - * don't register the region that could be conflicted with - * pci device BAR resource and insert them later in - * pcibios_resource_survey() + * Don't register the region that could be conflicted with + * PCI device BAR resources and insert them later in + * pcibios_resource_survey(): */ - if (do_mark_busy(e820->map[i].type, res)) { + if (do_mark_busy(entry->type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - for (i = 0; i < e820_saved->nr_map; i++) { - struct e820entry *entry = &e820_saved->map[i]; - firmware_map_add_early(entry->addr, - entry->addr + entry->size, - e820_type_to_string(entry->type)); + for (i = 0; i < e820_table_firmware->nr_entries; i++) { + struct e820_entry *entry = e820_table_firmware->entries + i; + + firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } } -/* How much should we pad RAM ending depending on where it is? */ +/* + * How much should we pad the end of RAM, depending on where it is? 
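 *
 * (Editorial aside, not part of the patch: ram_alignment() answers this
 *  with 64 KiB granularity in the first MiB, 1 MiB granularity up to
 *  16 MiB, and 64 MiB above that. So e820__reserve_resources_late()
 *  below would round a RAM region ending at, say, 0x3f7e0000 up to
 *  0x40000000 and reserve [0x3f7e0000, 0x3fffffff] as a "RAM buffer".)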
+ */ static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1066,64 +1091,59 @@ static unsigned long __init ram_alignment(resource_size_t pos) #define MAX_RESOURCE_SIZE ((resource_size_t)-1) -void __init e820_reserve_resources_late(void) +void __init e820__reserve_resources_late(void) { int i; struct resource *res; res = e820_res; - for (i = 0; i < e820->nr_map; i++) { + for (i = 0; i < e820_table->nr_entries; i++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; } /* - * Try to bump up RAM regions to reasonable boundaries to + * Try to bump up RAM regions to reasonable boundaries, to * avoid stolen RAM: */ - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *entry = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; u64 start, end; - if (entry->type != E820_RAM) + if (entry->type != E820_TYPE_RAM) continue; + start = entry->addr + entry->size; end = round_up(start, ram_alignment(start)) - 1; if (end > MAX_RESOURCE_SIZE) end = MAX_RESOURCE_SIZE; if (start >= end) continue; - printk(KERN_DEBUG - "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", - start, end); - reserve_region_with_split(&iomem_resource, start, end, - "RAM buffer"); + + printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end); + reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } } -char *__init default_machine_specific_memory_setup(void) +/* + * Pass the firmware (bootloader) E820 map to the kernel and process it: + */ +char *__init e820__memory_setup_default(void) { char *who = "BIOS-e820"; - u32 new_nr; + /* * Try to copy the BIOS-supplied E820-map. * * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) - < 0) { + if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) { u64 mem_size; - /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { + /* Compare results from other methods and take the one that gives more RAM: */ + if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { mem_size = boot_params.screen_info.ext_mem_k; who = "BIOS-88"; } else { @@ -1131,84 +1151,68 @@ char *__init default_machine_specific_memory_setup(void) who = "BIOS-e801"; } - e820->nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + e820_table->nr_entries = 0; + e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM); + e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM); } - /* In case someone cares... 
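 *
 * (Editorial aside, not part of the patch: when append_e820_table() fails
 *  above, the fallback map is just two E820_TYPE_RAM ranges - 0 to 640 KiB
 *  (LOWMEMSIZE) plus HIGH_MEMORY (1 MiB) upwards for mem_size KiB, where
 *  mem_size is the larger of the BIOS-88 (screen_info.ext_mem_k) and
 *  BIOS-e801 (alt_mem_k) values.)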
*/ + /* We just appended a lot of ranges, sanitize the table: */ + e820__update_table(e820_table); + return who; } -void __init setup_memory_map(void) +/* + * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader + * E820 map - with an optional platform quirk available for virtual platforms + * to override this method of boot environment processing: + */ +void __init e820__memory_setup(void) { char *who; + /* This is a firmware interface ABI - make sure we don't break it: */ + BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20); + who = x86_init.resources.memory_setup(); - memcpy(e820_saved, e820, sizeof(struct e820map)); - printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); - e820_print_map(who); + + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("e820: BIOS-provided physical RAM map:\n"); + e820__print_table(who); } -void __init memblock_x86_fill(void) +void __init e820__memblock_setup(void) { int i; u64 end; /* - * EFI may have more than 128 entries - * We are safe to enable resizing, beause memblock_x86_fill() - * is rather later for x86 + * The bootstrap memblock region count maximum is 128 entries + * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries + * than that - so allow memblock resizing. + * + * This is safe, because this call happens pretty late during x86 setup, + * so we know about reserved memory regions already. (This is important + * so that memblock resizing does no stomp over reserved areas.) */ memblock_allow_resize(); - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - end = ei->addr + ei->size; + end = entry->addr + entry->size; if (end != (resource_size_t)end) continue; - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; - memblock_add(ei->addr, ei->size); + memblock_add(entry->addr, entry->size); } - /* throw away partial pages */ + /* Throw away partial pages: */ memblock_trim_memory(PAGE_SIZE); memblock_dump_all(); } - -void __init memblock_find_dma_reserve(void) -{ -#ifdef CONFIG_X86_64 - u64 nr_pages = 0, nr_free_pages = 0; - unsigned long start_pfn, end_pfn; - phys_addr_t start, end; - int i; - u64 u; - - /* - * need to find out used area below MAX_DMA_PFN - * need to use memblock to get free size in [0, MAX_DMA_PFN] - * at first, and assume boot_mem will not take below MAX_DMA_PFN - */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min(start_pfn, MAX_DMA_PFN); - end_pfn = min(end_pfn, MAX_DMA_PFN); - nr_pages += end_pfn - start_pfn; - } - - for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, - NULL) { - start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); - end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); - if (start_pfn < end_pfn) - nr_free_pages += end_pfn - start_pfn; - } - - set_dma_reserve(nr_pages - nr_free_pages); -#endif -} diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6a08e25a48d8..ff7e4b3988ed 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -546,8 +546,8 @@ intel_graphics_stolen(int num, int slot, int func, &base, &end); /* Mark this space as reserved */ - e820_add_region(base, size, E820_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__range_add(base, 
size, E820_TYPE_RESERVED); + e820__update_table(e820_table); } static void __init intel_graphics_quirks(int num, int slot, int func) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8f3d9cf26ff9..5b7153540727 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -983,6 +983,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long return_hooker = (unsigned long) &return_to_handler; + /* + * When resuming from suspend-to-ram, this function can be indirectly + * called from early CPU startup code while the CPU is in real mode, + * which would fail miserably. Make sure the stack pointer is a + * virtual address. + * + * This check isn't as accurate as virt_addr_valid(), but it should be + * good enough for this purpose, and it's fast. + */ + if (unlikely((long)__builtin_frame_address(0) >= 0)) + return; + if (unlikely(ftrace_graph_is_dead())) return; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index e5fb436a6548..538ec012b371 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,7 +12,7 @@ #include <asm/setup.h> #include <asm/sections.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/page.h> #include <asm/apic.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b5785c197e53..43b7002f44fb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,7 +24,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/kdebug.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/bios_ebda.h> #include <asm/bootparam_utils.h> #include <asm/microcode.h> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b467b14b03eb..ac9d327d2e42 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -269,10 +269,8 @@ ENTRY(secondary_startup_64) /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - jmp start_cpu -ENDPROC(secondary_startup_64) -ENTRY(start_cpu) +.Ljump_to_C_code: /* * Jump to run C code and to be on a real kernel address. * Since we are running on identity-mapped space we have to jump @@ -305,7 +303,7 @@ ENTRY(start_cpu) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(start_cpu) +ENDPROC(secondary_startup_64) #include "verify_cpu.S" @@ -313,11 +311,11 @@ ENDPROC(start_cpu) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary() via start_cpu(). + * start_secondary() via .Ljump_to_C_code. 
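 *
 * (Editorial aside, not part of the patch: .Ljump_to_C_code leaves the
 *  identity mapping with a far return - in essence it pushes __KERNEL_CS
 *  and the target's kernel virtual address, then executes lretq so that
 *  %cs and %rip are reloaded in a single instruction:
 *
 *	movq	initial_code(%rip), %rax
 *	pushq	$__KERNEL_CS
 *	pushq	%rax			# target address in negative space
 *	lretq
 *
 *  Making it a local label instead of ENTRY(start_cpu) drops a symbol
 *  that was only ever reached by direct jumps.)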
*/ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp - jmp start_cpu + jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index d0a814a9d96a..9d7fd5e6689a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -25,6 +25,7 @@ #include <asm/setup.h> #include <asm/crash.h> #include <asm/efi.h> +#include <asm/e820/api.h> #include <asm/kexec-bzimage64.h> #define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ @@ -99,15 +100,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_saved->nr_map; + nr_e820_entries = e820_table_firmware->nr_entries; - /* TODO: Pass entries more than E820MAX in bootparams setup data */ - if (nr_e820_entries > E820MAX) - nr_e820_entries = E820MAX; + /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ + if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) + nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_map, &e820_saved->map, - nr_e820_entries * sizeof(struct e820entry)); + memcpy(¶ms->e820_table, &e820_table_firmware->entries, nr_e820_entries*sizeof(struct e820_entry)); return 0; } @@ -232,10 +232,10 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, nr_e820_entries = params->e820_entries; for (i = 0; i < nr_e820_entries; i++) { - if (params->e820_map[i].type != E820_RAM) + if (params->e820_table[i].type != E820_TYPE_RAM) continue; - start = params->e820_map[i].addr; - end = params->e820_map[i].addr + params->e820_map[i].size - 1; + start = params->e820_table[i].addr; + end = params->e820_table[i].addr + params->e820_table[i].size - 1; if ((start <= 0x100000) && end > 0x100000) { mem_k = (end >> 10) - (0x100000 >> 10); diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index d688826e5736..db2182d63ed0 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -67,7 +67,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(kprobe_opcode_t *instruction, void *addr); +extern int can_boost(struct insn *insn, void *orig_addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); @@ -75,7 +75,7 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, * Copy an instruction and adjust the displacement if the instruction * uses the %rip-relative addressing mode. */ -extern int __copy_instruction(u8 *dest, u8 *src); +extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn); /* Generate a relative-jump/call instruction */ extern void synthesize_reljump(void *from, void *to); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 993fa4fe4f68..19e1f2a6d7b0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -164,42 +164,38 @@ static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) NOKPROBE_SYMBOL(skip_prefixes); /* - * Returns non-zero if opcode is boostable. + * Returns non-zero if INSN is boostable. 
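 *
 * (Editorial aside, not part of the patch: "boosting" means executing the
 *  copied instruction directly from the out-of-line slot and returning
 *  with a synthesized relative jump, instead of single-stepping it under
 *  a debug trap. See prepare_boost() further down, which emits
 *
 *	synthesize_reljump(p->ainsn.insn + insn->length,
 *			   p->addr + insn->length);
 *
 *  only for instructions that can_boost() accepts.)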
* RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(kprobe_opcode_t *opcodes, void *addr) +int can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; - kprobe_opcode_t *orig_opcodes = opcodes; if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ -retry: - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - opcode = *(opcodes++); - /* 2nd-byte opcode */ - if (opcode == 0x0f) { - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - return test_bit(*opcodes, + if (insn->opcode.nbytes == 2) + return test_bit(insn->opcode.bytes[1], (unsigned long *)twobyte_is_boostable); - } + + if (insn->opcode.nbytes != 1) + return 0; + + /* Can't boost Address-size override prefix */ + if (unlikely(inat_is_address_size_prefix(insn->attr))) + return 0; + + opcode = insn->opcode.bytes[0]; switch (opcode & 0xf0) { -#ifdef CONFIG_X86_64 - case 0x40: - goto retry; /* REX prefix is boostable */ -#endif case 0x60: - if (0x63 < opcode && opcode < 0x67) - goto retry; /* prefixes */ - /* can't boost Address-size override and bound */ - return (opcode != 0x62 && opcode != 0x67); + /* can't boost "bound" */ + return (opcode != 0x62); case 0x70: return 0; /* can't boost conditional jump */ + case 0x90: + return opcode != 0x9a; /* can't boost call far */ case 0xc0: /* can't boost software-interruptions */ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; @@ -210,14 +206,9 @@ retry: /* can boost in/out and absolute jmps */ return ((opcode & 0x04) || opcode == 0xea); case 0xf0: - if ((opcode & 0x0c) == 0 && opcode != 0xf1) - goto retry; /* lock/rep(ne) prefix */ /* clear and set flags are boostable */ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); default: - /* segment override prefixes are boostable */ - if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) - goto retry; /* prefixes */ /* CS override prefix and call are not boostable */ return (opcode != 0x2e && opcode != 0x9a); } @@ -264,7 +255,10 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Fortunately, we know that the original code is the ideal 5-byte * long NOP. */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (faddr) memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); else @@ -276,7 +270,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Recover the probed instruction at addr for further analysis. * Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. - * Returns zero if the instruction can not get recovered. + * Returns zero if the instruction can not get recovered (or access failed). */ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { @@ -348,37 +342,36 @@ static int is_IF_modifier(kprobe_opcode_t *insn) } /* - * Copy an instruction and adjust the displacement if the instruction - * uses the %rip-relative addressing mode. - * If it does, Return the address of the 32-bit displacement word. - * If not, return null. - * Only applicable to 64-bit x86. + * Copy an instruction with recovering modified instruction by kprobes + * and adjust the displacement if the instruction uses the %rip-relative + * addressing mode. + * This returns the length of copied instruction, or 0 if it has an error. 
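 *
 * (Editorial aside, not part of the patch: for a %rip-relative operand the
 *  CPU computes target = next_ip + disp32. Copying moves next_ip from
 *  src + length to dest + length, so keeping the same target requires
 *
 *	newdisp = (src + length + disp) - (dest + length)
 *	        = src + disp - dest
 *
 *  which is exactly what the function computes below, together with a
 *  check that the result still fits in a signed 32-bit displacement.)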
*/ -int __copy_instruction(u8 *dest, u8 *src) +int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) { - struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - int length; unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); - if (!recovered_insn) + if (!recovered_insn || !insn) + return 0; + + /* This can access kernel text if given address is not recovered */ + if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE)) return 0; - kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); - insn_get_length(&insn); - length = insn.length; + + kernel_insn_init(insn, dest, MAX_INSN_SIZE); + insn_get_length(insn); /* Another subsystem puts a breakpoint, failed to recover */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; - memcpy(dest, insn.kaddr, length); #ifdef CONFIG_X86_64 - if (insn_rip_relative(&insn)) { + /* Only x86_64 has RIP relative instructions */ + if (insn_rip_relative(insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest, length); - insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing * mode. Adjust the displacement for the difference between @@ -391,36 +384,57 @@ int __copy_instruction(u8 *dest, u8 *src) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; + newdisp = (u8 *) src + (s64) insn->displacement.value + - (u8 *) dest; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value); + pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", + src, dest, insn->displacement.value); return 0; } - disp = (u8 *) dest + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(insn); *(s32 *) disp = (s32) newdisp; } #endif - return length; + return insn->length; +} + +/* Prepare reljump right after instruction to boost */ +static void prepare_boost(struct kprobe *p, struct insn *insn) +{ + if (can_boost(insn, p->addr) && + MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + synthesize_reljump(p->ainsn.insn + insn->length, + p->addr + insn->length); + p->ainsn.boostable = true; + } else { + p->ainsn.boostable = false; + } } static int arch_copy_kprobe(struct kprobe *p) { - int ret; + struct insn insn; + int len; + + set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Copy an instruction with recovering if other optprobe modifies it.*/ - ret = __copy_instruction(p->ainsn.insn, p->addr); - if (!ret) + len = __copy_instruction(p->ainsn.insn, p->addr, &insn); + if (!len) return -EINVAL; /* * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. 
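 *
 * (Editorial aside, not part of the patch: note the new set_memory_rw() /
 *  set_memory_ro() pair around this sequence - instruction slots are now
 *  mapped read-only, so the page holding p->ainsn.insn must be made
 *  writable for the copy and the reljump, then flipped back.)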
*/ - if (can_boost(p->ainsn.insn, p->addr)) - p->ainsn.boostable = 0; - else - p->ainsn.boostable = -1; + prepare_boost(p, &insn); + + set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Check whether the instruction modifies Interrupt Flag or not */ p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); @@ -459,7 +473,7 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } } @@ -531,7 +545,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, return; #if !defined(CONFIG_PREEMPT) - if (p->ainsn.boostable == 1 && !p->post_handler) { + if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) reset_current_kprobe(); @@ -851,7 +865,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, case 0xcf: case 0xea: /* jmp absolute -- ip is correct */ /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; case 0xe8: /* call relative - Fix return addr */ *tos = orig_ip + (*tos - copy_ip); @@ -876,28 +890,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, * jmp near and far, absolute indirect * ip is correct. And this is boostable */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; } default: break; } - if (p->ainsn.boostable == 0) { - if ((regs->ip > copy_ip) && - (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { - /* - * These instructions can be executed directly if it - * jumps back to correct address. - */ - synthesize_reljump((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); - p->ainsn.boostable = 1; - } else { - p->ainsn.boostable = -1; - } - } - regs->ip += orig_ip - copy_ip; no_change: diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 5f8f0b3cc674..041f7b6dfa0f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -94,6 +94,6 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); int arch_prepare_kprobe_ftrace(struct kprobe *p) { p->ainsn.insn = NULL; - p->ainsn.boostable = -1; + p->ainsn.boostable = false; return 0; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3e7c6e5a08ff..9aadff3d0902 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -65,7 +65,10 @@ found: * overwritten by jump destination address. In this case, original * bytes must be recovered from op->optinsn.copied_insn buffer. 
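 *
 * (Editorial aside, not part of the patch: the memcpy() below is replaced
 *  by probe_kernel_read(), which reads kernel text through a fault-safe
 *  path and returns -EFAULT instead of oopsing if the page is unmapped;
 *  on failure the recovery helpers now return 0UL, which callers already
 *  treat as "could not recover".)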
*/ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (addr == (unsigned long)kp->addr) { buf[0] = kp->opcode; memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); @@ -174,11 +177,12 @@ NOKPROBE_SYMBOL(optimized_callback); static int copy_optimized_instructions(u8 *dest, u8 *src) { + struct insn insn; int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len, src + len)) + ret = __copy_instruction(dest + len, src + len, &insn); + if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; } @@ -350,6 +354,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, } buf = (u8 *)op->optinsn.insn; + set_memory_rw((unsigned long)buf & PAGE_MASK, 1); /* Copy instructions into the out-of-line buffer */ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); @@ -372,6 +377,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, (u8 *)op->kp.addr + op->optinsn.size); + set_memory_ro((unsigned long)buf & PAGE_MASK, 1); + flush_icache_range((unsigned long) buf, (unsigned long) buf + TMPL_END_IDX + op->optinsn.size + RELATIVEJUMP_SIZE); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0f8d20497383..0d904d759ff1 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -26,7 +26,7 @@ #include <asm/io_apic.h> #include <asm/proto.h> #include <asm/bios_ebda.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/setup.h> #include <asm/smp.h> @@ -826,10 +826,10 @@ static int __init parse_alloc_mptable_opt(char *p) } early_param("alloc_mptable", parse_alloc_mptable_opt); -void __init early_reserve_e820_mpc_new(void) +void __init e820__memblock_alloc_reserved_mpc_new(void) { if (enable_update_mptable && alloc_mptable) - mpc_new_phys = early_reserve_e820(mpc_new_length, 4); + mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4); } static int __init update_mp_table(void) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a723ae9440ab..446c8aa09b9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -222,17 +222,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - if (panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index d5f15c3f7b25..963e3fb56437 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -14,7 +14,7 @@ #include <asm/probe_roms.h> #include <asm/pci-direct.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mmzone.h> #include <asm/setup.h> #include <asm/sections.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f67591561711..0bb88428cbf2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,6 +37,7 @@ #include <asm/vm86.h> #include <asm/switch_to.h> #include <asm/desc.h> +#include <asm/prctl.h> /* * per-CPU TSS segments. 
Threads are completely 'soft' on Linux, @@ -124,11 +125,6 @@ void flush_thread(void) fpu__clear(&tsk->thread.fpu); } -static void hard_disable_TSC(void) -{ - cr4_set_bits(X86_CR4_TSD); -} - void disable_TSC(void) { preempt_disable(); @@ -137,15 +133,10 @@ void disable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_disable_TSC(); + cr4_set_bits(X86_CR4_TSD); preempt_enable(); } -static void hard_enable_TSC(void) -{ - cr4_clear_bits(X86_CR4_TSD); -} - static void enable_TSC(void) { preempt_disable(); @@ -154,7 +145,7 @@ static void enable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_enable_TSC(); + cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } @@ -182,54 +173,129 @@ int set_tsc_mode(unsigned int val) return 0; } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; +DEFINE_PER_CPU(u64, msr_misc_features_shadow); - if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ - test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); +static void set_cpuid_faulting(bool on) +{ + u64 msrval; - debugctl &= ~DEBUGCTLMSR_BTF; - if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) - debugctl |= DEBUGCTLMSR_BTF; + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} - update_debugctlmsr(debugctl); +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); } + preempt_enable(); +} - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +{ + if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); + + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. */ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); +} - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +static inline void switch_to_bitmap(struct tss_struct *tss, + struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) +{ + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. * Normally this is 128 bytes or less: */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); - /* * Make sure that the TSS limit is correct for the CPU * to notice the IO bitmap. 
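 *
 * (Editorial aside, not part of the patch: __switch_to_xtra() below now
 *  snapshots both tasks' thread-info flags once into tifp/tifn and works
 *  on the pair, e.g.
 *
 *	if ((tifp ^ tifn) & _TIF_NOTSC)
 *		cr4_toggle_bits(X86_CR4_TSD);
 *
 *  so CR4.TSD is only written when the two tasks disagree, and the BTF
 *  debugctl bit is propagated branchlessly by shifting the _TIF_BLOCKSTEP
 *  bit into position rather than testing each task separately.)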
*/ refresh_tss_limit(); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + } else if (tifp & _TIF_IO_BITMAP) { /* * Clear any possible leftover bits: */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long tifp, tifn; + + prev = &prev_p->thread; + next = &next_p->thread; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + switch_to_bitmap(tss, prev, next, tifp, tifn); + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); } /* @@ -550,3 +616,16 @@ out: put_task_stack(p); return ret; } + +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled) +{ + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(task, cpuid_enabled); + } + + return -EINVAL; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4c818f8bc135..ff40e74c9181 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/ldt.h> @@ -56,6 +57,7 @@ #include <asm/switch_to.h> #include <asm/vm86.h> #include <asm/intel_rdt.h> +#include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) { @@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d6b784a5520d..ea1a6180bf39 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -204,7 +205,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, tls); + err = do_arch_prctl_64(p, ARCH_SET_FS, tls); if (err) goto out; } @@ -547,70 +548,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) } #endif -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (option) { case ARCH_SET_GS: - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.gsindex = 0; - task->thread.gsbase = addr; + task->thread.gsbase = arg2; if (doit) { load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); } put_cpu(); break; case ARCH_SET_FS: /* Not strictly 
needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.fsindex = 0; - task->thread.fsbase = addr; + task->thread.fsbase = arg2; if (doit) { /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, arg2); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; + if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base; + if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: - return prctl_map_vdso(&vdso_image_x32, addr); + return prctl_map_vdso(&vdso_image_x32, arg2); # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: - return prctl_map_vdso(&vdso_image_32, addr); + return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: - return prctl_map_vdso(&vdso_image_64, addr); + return prctl_map_vdso(&vdso_image_64, arg2); #endif default: @@ -621,10 +624,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + long ret; + + ret = do_arch_prctl_64(current, option, arg2); + if (ret == -EINVAL) + ret = do_arch_prctl_common(current, option, arg2); + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl(current, code, addr); + return do_arch_prctl_common(current, option, arg2); } +#endif unsigned long KSTK_ESP(struct task_struct *task) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2364b23ea3e5..f37d18124648 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -396,12 +396,12 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl + * When changing the segment base, use do_arch_prctl_64 * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ if (child->thread.fsbase != value) - return do_arch_prctl(child, ARCH_SET_FS, value); + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -410,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl(child, ARCH_SET_GS, value); + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } @@ -869,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request, Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); + ret = do_arch_prctl_64(child, data, addr); break; #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 067f9813fd2c..2544700a2a87 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs) #endif +/* This is the CPU performing the emergency shutdown work. 
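 *
 * (Editorial aside on the arch_prctl() rework above, not part of the
 *  patch: with ARCH_GET_CPUID/ARCH_SET_CPUID wired up through
 *  do_arch_prctl_common(), a task on a CPU with X86_FEATURE_CPUID_FAULT
 *  can opt out of the CPUID instruction:
 *
 *	arch_prctl(ARCH_SET_CPUID, 0);	// CPUID now raises SIGSEGV
 *	arch_prctl(ARCH_SET_CPUID, 1);	// CPUID works again
 *	arch_prctl(ARCH_GET_CPUID, 0);	// query the current mode
 *
 *  The state is reset on exec via arch_setup_new_exec().)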
*/ +int crashing_cpu = -1; + #if defined(CONFIG_SMP) -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 2408c1603438..5ab3895516ac 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,5 +1,5 @@ #include <linux/ioport.h> -#include <asm/e820.h> +#include <asm/e820/api.h> static void resource_clip(struct resource *res, resource_size_t start, resource_size_t end) @@ -25,10 +25,10 @@ static void resource_clip(struct resource *res, resource_size_t start, static void remove_e820_regions(struct resource *avail) { int i; - struct e820entry *entry; + struct e820_entry *entry; - for (i = 0; i < e820->nr_map; i++) { - entry = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + entry = &e820_table->entries[i]; resource_clip(avail, entry->addr, entry->addr + entry->size - 1); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4bf0c8926a1c..62a1c74855e5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -75,7 +75,7 @@ #include <asm/mtrr.h> #include <asm/apic.h> #include <asm/realmode.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/efi.h> @@ -119,7 +119,7 @@ * max_low_pfn_mapped: highest direct mapped pfn under 4GB * max_pfn_mapped: highest direct mapped pfn over 4GB * - * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; @@ -173,14 +173,11 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -/* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data = { - .wp_works_ok = -1, -}; +/* cpu data as detected by the assembly code in head_32.S */ +struct cpuinfo_x86 new_cpu_data; + /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .wp_works_ok = -1, -}; +struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; @@ -426,7 +423,7 @@ static void __init parse_setup_data(void) switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(pa_data, data_len); + e820__memory_setup_extended(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); @@ -441,29 +438,6 @@ static void __init parse_setup_data(void) } } -static void __init e820_reserve_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - - pa_data = boot_params.hdr.setup_data; - if (!pa_data) - return; - - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - e820_update_range(pa_data, sizeof(*data)+data->len, - E820_RAM, E820_RESERVED_KERN); - pa_data = data->next; - early_memunmap(data, sizeof(*data)); - } - - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); - memcpy(e820_saved, e820, sizeof(struct e820map)); - printk(KERN_INFO "extended physical RAM map:\n"); - e820_print_map("reserve setup_data"); -} - static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; @@ -756,16 +730,16 @@ static void __init trim_bios_range(void) * since some BIOSes are known to corrupt low memory. See the * Kconfig help text for X86_RESERVE_LOW. 
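 *
 * (Editorial aside, not part of the patch: e820__range_update() only
 *  retypes the parts of the given range that currently have the old
 *  type, so
 *
 *	e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
 *
 *  turns page 0 from RAM into a reserved range without touching anything
 *  the firmware already typed differently.)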
*/ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. * take them out. */ - e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__update_table(e820_table); } /* called before trim_bios_range() to spare extra sanitize */ @@ -775,18 +749,18 @@ static void __init e820_add_kernel_range(void) u64 size = __pa_symbol(_end) - start; /* - * Complain if .text .data and .bss are not marked as E820_RAM and + * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and * attempt to fix it by adding the range. We may have a confused BIOS, * or the user may have used memmap=exactmap or memmap=xxM$yyM to * exclude kernel range. If we really are running on top non-RAM, * we will crash later anyways. */ - if (e820_all_mapped(start, start + size, E820_RAM)) + if (e820__mapped_all(start, start + size, E820_TYPE_RAM)) return; - pr_warn(".text .data .bss are not marked as E820_RAM!\n"); - e820_remove_range(start, size, E820_RAM, 0); - e820_add_region(start, size, E820_RAM); + pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); + e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_add(start, size, E820_TYPE_RAM); } static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; @@ -939,7 +913,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; - setup_memory_map(); + e820__memory_setup(); parse_setup_data(); copy_edd(); @@ -1028,9 +1002,8 @@ void __init setup_arch(char **cmdline_p) early_dump_pci_devices(); #endif - /* update the e820_saved too */ - e820_reserve_setup_data(); - finish_e820_parsing(); + e820__reserve_setup_data(); + e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) efi_init(); @@ -1056,11 +1029,11 @@ void __init setup_arch(char **cmdline_p) trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { - e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, - E820_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, + E820_TYPE_RESERVED); + e820__update_table(e820_table); printk(KERN_INFO "fixed physical RAM map:\n"); - e820_print_map("bad_ppro"); + e820__print_table("bad_ppro"); } #else early_gart_iommu_check(); @@ -1070,12 +1043,12 @@ void __init setup_arch(char **cmdline_p) * partially used pages are not usable - thus * we are rounding upwards: */ - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); max_possible_pfn = max_pfn; @@ -1094,7 +1067,7 @@ void __init setup_arch(char **cmdline_p) /* How many end-of-memory variables you have, grandma! 
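 *
 * (Editorial aside, not part of the patch: at this point max_pfn is the
 *  top of E820 RAM, max_low_pfn is the top of RAM below 4 GiB - computed
 *  via e820__end_of_low_ram_pfn() when RAM extends past 4 GiB - and
 *  max_possible_pfn starts out equal to max_pfn but can later be raised
 *  to cover hotpluggable memory.)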
*/ /* need this before calling reserve_initrd */ if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) - max_low_pfn = e820_end_of_low_ram_pfn(); + max_low_pfn = e820__end_of_low_ram_pfn(); else max_low_pfn = max_pfn; @@ -1111,7 +1084,7 @@ void __init setup_arch(char **cmdline_p) early_alloc_pgt_buf(); /* - * Need to conclude brk, before memblock_x86_fill() + * Need to conclude brk, before e820__memblock_setup() * it could use memblock_find_in_range, could overlap with * brk area. */ @@ -1120,7 +1093,7 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); memblock_set_current_limit(ISA_END_ADDRESS); - memblock_x86_fill(); + e820__memblock_setup(); reserve_bios_regions(); @@ -1137,7 +1110,7 @@ void __init setup_arch(char **cmdline_p) } /* preallocate 4k for mptable mpc */ - early_reserve_e820_mpc_new(); + e820__memblock_alloc_reserved_mpc_new(); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); @@ -1275,12 +1248,12 @@ void __init setup_arch(char **cmdline_p) kvm_guest_init(); - e820_reserve_resources(); - e820_mark_nosave_regions(max_low_pfn); + e820__reserve_resources(); + e820__register_nosave_regions(max_low_pfn); x86_init.resources.reserve_resources(); - e820_setup_gap(); + e820__setup_pci_gap(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 396c042e9d0e..cc30a74e4adb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -846,7 +846,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ec1f756f9dc9..71beb28600d4 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, if (from->si_signo == SIGSEGV) { if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)&to->si_lower; - compat_uptr_t upper = (unsigned long)&to->si_upper; + compat_uptr_t lower = (unsigned long)from->si_lower; + compat_uptr_t upper = (unsigned long)from->si_upper; put_user_ex(lower, &to->si_lower); put_user_ex(upper, &to->si_upper); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d3c66a15bbde..d798c0da451c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -33,6 +33,7 @@ #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> +#include <asm/virtext.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -124,7 +125,7 @@ static bool smp_no_nmi_ipi = false; static void native_smp_send_reschedule(int cpu) { if (unlikely(cpu_is_offline(cpu))) { - WARN_ON(1); + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } apic->send_IPI(cpu, RESCHEDULE_VECTOR); @@ -162,6 +163,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; + cpu_emergency_vmxoff(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -174,6 +176,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { ipi_entering_ack_irq(); + cpu_emergency_vmxoff(); stop_this_cpu(NULL); irq_exit(); } diff --git a/arch/x86/kernel/tboot.c 
b/arch/x86/kernel/tboot.c index b868fa1b812b..ccccd335ae01 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -42,7 +42,7 @@ #include <asm/fixmap.h> #include <asm/proto.h> #include <asm/setup.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> #include "../realmode/rm/wakeup.h" @@ -68,9 +68,9 @@ void __init tboot_probe(void) * also verify that it is mapped as we expect it before calling * set_fixmap(), to reduce chance of garbage value causing crash */ - if (!e820_any_mapped(boot_params.tboot_addr, - boot_params.tboot_addr, E820_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + if (!e820__mapped_any(boot_params.tboot_addr, + boot_params.tboot_addr, E820_TYPE_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -188,12 +188,12 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; - for (i = 0; i < e820->nr_map; i++) { - if ((e820->map[i].type != E820_RAM) - && (e820->map[i].type != E820_RESERVED_KERN)) + for (i = 0; i < e820_table->nr_entries; i++) { + if ((e820_table->entries[i].type != E820_TYPE_RAM) + && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) continue; - add_mac_region(e820->map[i].addr, e820->map[i].size); + add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size); } tboot->acpi_sinfo.kernel_s3_resume_vector = diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3c0751b120de..3995d3a777d4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -289,7 +289,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } @@ -553,7 +553,7 @@ do_general_protection(struct pt_regs *regs, long error_code) pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c73a7f9e881a..714dfba6a1e7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -328,7 +328,7 @@ unsigned long long sched_clock(void) return paravirt_sched_clock(); } -static inline bool using_native_sched_clock(void) +bool using_native_sched_clock(void) { return pv_time_ops.sched_clock == native_sched_clock; } @@ -336,7 +336,7 @@ static inline bool using_native_sched_clock(void) unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); -static inline bool using_native_sched_clock(void) { return true; } +bool using_native_sched_clock(void) { return true; } #endif int check_tsc_unstable(void) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 11a93f005268..a088b2c47f73 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,7 +14,7 @@ #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/time.h> #include <asm/irq.h> #include <asm/io_apic.h> @@ -38,7 +38,7 @@ struct x86_init_ops x86_init __initdata = { .resources = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, - .memory_setup = default_machine_specific_memory_setup, + .memory_setup = e820__memory_setup_default, }, .mpparse = { |
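/*
 * (Editorial aside, not part of the patch: x86_init.resources.memory_setup
 *  is the hook that the hunk above points at e820__memory_setup_default().
 *  A hypothetical platform could override it to bypass the BIOS E820 path
 *  entirely - a minimal sketch, with example_memory_setup() and the
 *  256 MiB size being made-up illustrations:
 */
static char * __init example_memory_setup(void)
{
	/* Pretend the platform knows it has exactly 256 MiB of RAM: */
	e820__range_add(0, 0x10000000, E820_TYPE_RAM);
	e820__update_table(e820_table);

	return "example-platform";
}

static void __init example_platform_setup(void)
{
	x86_init.resources.memory_setup = example_memory_setup;
}
/* (End of editorial aside.) */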