Diffstat (limited to 'arch/x86/kernel')
67 files changed, 1556 insertions, 477 deletions
| diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index c88e0b127810..b481b95bd8f6 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -14,8 +14,11 @@  #include <asm/amd_nb.h>  #define PCI_DEVICE_ID_AMD_17H_ROOT	0x1450 +#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT	0x15d0  #define PCI_DEVICE_ID_AMD_17H_DF_F3	0x1463  #define PCI_DEVICE_ID_AMD_17H_DF_F4	0x1464 +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec  /* Protect the PCI config register pairs used for SMN and DF indirect access. */  static DEFINE_MUTEX(smn_mutex); @@ -24,6 +27,7 @@ static u32 *flush_words;  static const struct pci_device_id amd_root_ids[] = {  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, +	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },  	{}  }; @@ -39,6 +43,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, +	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },  	{}  }; @@ -51,6 +56,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, +	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },  	{}  }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7553819c74c3..3982f79d2377 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1851,7 +1851,7 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data)  	 * intr-remapping table entry. Hence for the io-apic  	 * EOI we use the pin number.  	 */ -	ack_APIC_irq(); +	apic_ack_irq(irq_data);  	eoi_ioapic_pin(data->entry.vector, data);  } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index bb6f7a2148d7..35aaee4fc028 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -235,6 +235,15 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest)  	if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest))  		return 0; +	/* +	 * Careful here. @apicd might either have move_in_progress set or +	 * be enqueued for cleanup. Assigning a new vector would either +	 * leave a stale vector on some CPU around or in case of a pending +	 * cleanup corrupt the hlist. 
+	 */ +	if (apicd->move_in_progress || !hlist_unhashed(&apicd->clist)) +		return -EBUSY; +  	vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);  	if (vector > 0)  		apic_update_vector(irqd, vector, cpu); @@ -579,8 +588,7 @@ error:  static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,  				  struct irq_data *irqd, int ind)  { -	unsigned int cpu, vector, prev_cpu, prev_vector; -	struct apic_chip_data *apicd; +	struct apic_chip_data apicd;  	unsigned long flags;  	int irq; @@ -596,24 +604,26 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,  		return;  	} -	apicd = irqd->chip_data; -	if (!apicd) { +	if (!irqd->chip_data) {  		seq_printf(m, "%*sVector: Not assigned\n", ind, "");  		return;  	}  	raw_spin_lock_irqsave(&vector_lock, flags); -	cpu = apicd->cpu; -	vector = apicd->vector; -	prev_cpu = apicd->prev_cpu; -	prev_vector = apicd->prev_vector; +	memcpy(&apicd, irqd->chip_data, sizeof(apicd));  	raw_spin_unlock_irqrestore(&vector_lock, flags); -	seq_printf(m, "%*sVector: %5u\n", ind, "", vector); -	seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu); -	if (prev_vector) { -		seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vector); -		seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu); + +	seq_printf(m, "%*sVector: %5u\n", ind, "", apicd.vector); +	seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu); +	if (apicd.prev_vector) { +		seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector); +		seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu);  	} +	seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0); +	seq_printf(m, "%*sis_managed:       %u\n", ind, "", apicd.is_managed ? 1 : 0); +	seq_printf(m, "%*scan_reserve:      %u\n", ind, "", apicd.can_reserve ? 1 : 0); +	seq_printf(m, "%*shas_reserved:     %u\n", ind, "", apicd.has_reserved ? 
1 : 0); +	seq_printf(m, "%*scleanup_pending:  %u\n", ind, "", !hlist_unhashed(&apicd.clist));  }  #endif @@ -800,13 +810,18 @@ static int apic_retrigger_irq(struct irq_data *irqd)  	return 1;  } -void apic_ack_edge(struct irq_data *irqd) +void apic_ack_irq(struct irq_data *irqd)  { -	irq_complete_move(irqd_cfg(irqd));  	irq_move_irq(irqd);  	ack_APIC_irq();  } +void apic_ack_edge(struct irq_data *irqd) +{ +	irq_complete_move(irqd_cfg(irqd)); +	apic_ack_irq(irqd); +} +  static struct irq_chip lapic_controller = {  	.name			= "APIC",  	.irq_ack		= apic_ack_edge, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8b04234e010b..7685444a106b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -116,6 +116,7 @@ static void init_x2apic_ldr(void)  			goto update;  	}  	cmsk = cluster_hotplug_mask; +	cmsk->clusterid = cluster;  	cluster_hotplug_mask = NULL;  update:  	this_cpu_write(cluster_masks, cmsk); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index efaf2d4f9c3c..d492752f79e1 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -26,6 +26,7 @@  #include <linux/delay.h>  #include <linux/crash_dump.h>  #include <linux/reboot.h> +#include <linux/memory.h>  #include <asm/uv/uv_mmrs.h>  #include <asm/uv/uv_hub.h> @@ -392,6 +393,51 @@ extern int uv_hub_info_version(void)  }  EXPORT_SYMBOL(uv_hub_info_version); +/* Default UV memory block size is 2GB */ +static unsigned long mem_block_size = (2UL << 30); + +/* Kernel parameter to specify UV mem block size */ +static int parse_mem_block_size(char *ptr) +{ +	unsigned long size = memparse(ptr, NULL); + +	/* Size will be rounded down by set_block_size() below */ +	mem_block_size = size; +	return 0; +} +early_param("uv_memblksize", parse_mem_block_size); + +static __init int adj_blksize(u32 lgre) +{ +	unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT; +	unsigned long size; + +	for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1) +		if (IS_ALIGNED(base, size)) +			break; + +	if (size >= mem_block_size) +		return 0; + +	mem_block_size = size; +	return 1; +} + +static __init void set_block_size(void) +{ +	unsigned int order = ffs(mem_block_size); + +	if (order) { +		/* adjust for ffs return of 1..64 */ +		set_memory_block_size_order(order - 1); +		pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size); +	} else { +		/* bad or zero value, default to 1UL << 31 (2GB) */ +		pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size); +		set_memory_block_size_order(31); +	} +} +  /* Build GAM range lookup table: */  static __init void build_uv_gr_table(void)  { @@ -1180,23 +1226,30 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)  					<< UV_GAM_RANGE_SHFT);  		int order = 0;  		char suffix[] = " KMGTPE"; +		int flag = ' ';  		while (size > 9999 && order < sizeof(suffix)) {  			size /= 1024;  			order++;  		} +		/* adjust max block size to current range start */ +		if (gre->type == 1 || gre->type == 2) +			if (adj_blksize(lgre)) +				flag = '*'; +  		if (!index) {  			pr_info("UV: GAM Range Table...\n"); -			pr_info("UV:  # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); +			pr_info("UV:  # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");  		} -		pr_info("UV: %2d: 0x%014lx-0x%014lx %5lu%c %3d   %04x  %02x %02x\n", +		pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d   %04x  
%02x %02x\n",  			index++,  			(unsigned long)lgre << UV_GAM_RANGE_SHFT,  			(unsigned long)gre->limit << UV_GAM_RANGE_SHFT, -			size, suffix[order], +			flag, size, suffix[order],  			gre->type, gre->nasid, gre->sockid, gre->pnode); +		/* update to next range start */  		lgre = gre->limit;  		if (sock_min > gre->sockid)  			sock_min = gre->sockid; @@ -1427,6 +1480,7 @@ static void __init uv_system_init_hub(void)  	build_socket_tables();  	build_uv_gr_table(); +	set_block_size();  	uv_init_hub_info(&hub_info);  	uv_possible_blades = num_possible_nodes();  	if (!_node_to_pnode) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index dfcbe6924eaf..5d0de79fdab0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1715,19 +1715,6 @@ static int proc_apm_show(struct seq_file *m, void *v)  	return 0;  } -static int proc_apm_open(struct inode *inode, struct file *file) -{ -	return single_open(file, proc_apm_show, NULL); -} - -static const struct file_operations apm_file_ops = { -	.owner		= THIS_MODULE, -	.open		= proc_apm_open, -	.read		= seq_read, -	.llseek		= seq_lseek, -	.release	= single_release, -}; -  static int apm(void *unused)  {  	unsigned short	bx; @@ -2360,7 +2347,7 @@ static int __init apm_init(void)  	set_desc_base(&gdt[APM_DS >> 3],  		 (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); -	proc_create("apm", 0, NULL, &apm_file_ops); +	proc_create_single("apm", 0, NULL, proc_apm_show);  	kapmd_task = kthread_create(apm, NULL, "kapmd");  	if (IS_ERR(kapmd_task)) { @@ -2446,7 +2433,7 @@ MODULE_PARM_DESC(idle_threshold,  	"System idle percentage above which to make APM BIOS idle calls");  module_param(idle_period, int, 0444);  MODULE_PARM_DESC(idle_period, -	"Period (in sec/100) over which to caculate the idle percentage"); +	"Period (in sec/100) over which to calculate the idle percentage");  module_param(smp, bool, 0444);  MODULE_PARM_DESC(smp,  	"Set this to enable APM use on an SMP platform. 
Use with caution on older systems"); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 76417a9aab73..dcb008c320fe 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -32,7 +32,7 @@  void common(void) {  	BLANK();  	OFFSET(TASK_threadsp, task_struct, thread.sp); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR  	OFFSET(TASK_stack_canary, task_struct, stack_canary);  #endif diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index f91ba53e06c8..a4a3be399f4b 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -50,7 +50,7 @@ void foo(void)  	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -  	       offsetofend(struct cpu_entry_area, entry_stack_page.stack)); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR  	BLANK();  	OFFSET(stack_canary_offset, stack_canary, canary);  #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bf51e51d808d..b2dcd161f514 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -69,7 +69,7 @@ int main(void)  	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);  	BLANK(); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR  	DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));  	BLANK();  #endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a66229f51b12..7a40196967cb 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,7 +17,7 @@ KCOV_INSTRUMENT_perf_event.o := n  nostackp := $(call cc-option, -fno-stack-protector)  CFLAGS_common.o		:= $(nostackp) -obj-y			:= intel_cacheinfo.o scattered.o topology.o +obj-y			:= cacheinfo.o scattered.o topology.o  obj-y			+= common.o  obj-y			+= rdrand.o  obj-y			+= match.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 12bc0a1139da..082d7875cef8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,7 +9,9 @@  #include <linux/random.h>  #include <asm/processor.h>  #include <asm/apic.h> +#include <asm/cacheinfo.h>  #include <asm/cpu.h> +#include <asm/spec-ctrl.h>  #include <asm/smp.h>  #include <asm/pci-direct.h>  #include <asm/delay.h> @@ -297,7 +299,6 @@ static int nearby_node(int apicid)  }  #endif -#ifdef CONFIG_SMP  /*   * Fix up cpu_core_id for pre-F17h systems to be in the   * [0 .. cores_per_node - 1] range. Not really needed but @@ -327,6 +328,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)  	/* get information required for multi-node processors */  	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { +		int err;  		u32 eax, ebx, ecx, edx;  		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -345,21 +347,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c)  		}  		/* -		 * We may have multiple LLCs if L3 caches exist, so check if we -		 * have an L3 cache by looking at the L3 cache CPUID leaf. +		 * In case leaf B is available, use it to derive +		 * topology information.  		 */ -		if (cpuid_edx(0x80000006)) { -			if (c->x86 == 0x17) { -				/* -				 * LLC is at the core complex level. -				 * Core complex id is ApicId[3]. -				 */ -				per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; -			} else { -				/* LLC is at the node level. 
*/ -				per_cpu(cpu_llc_id, cpu) = node_id; -			} -		} +		err = detect_extended_topology(c); +		if (!err) +			c->x86_coreid_bits = get_count_order(c->x86_max_cores); + +		cacheinfo_amd_init_llc_id(c, cpu, node_id); +  	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {  		u64 value; @@ -375,7 +371,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)  		legacy_fixup_core_id(c);  	}  } -#endif  /*   * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. @@ -383,7 +378,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)   */  static void amd_detect_cmp(struct cpuinfo_x86 *c)  { -#ifdef CONFIG_SMP  	unsigned bits;  	int cpu = smp_processor_id(); @@ -394,17 +388,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)  	c->phys_proc_id = c->initial_apicid >> bits;  	/* use socket ID also for last level cache */  	per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; -	amd_get_topology(c); -#endif  }  u16 amd_get_nb_id(int cpu)  { -	u16 id = 0; -#ifdef CONFIG_SMP -	id = per_cpu(cpu_llc_id, cpu); -#endif -	return id; +	return per_cpu(cpu_llc_id, cpu);  }  EXPORT_SYMBOL_GPL(amd_get_nb_id); @@ -554,6 +542,26 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)  		rdmsrl(MSR_FAM10H_NODE_ID, value);  		nodes_per_socket = ((value >> 3) & 7) + 1;  	} + +	if (c->x86 >= 0x15 && c->x86 <= 0x17) { +		unsigned int bit; + +		switch (c->x86) { +		case 0x15: bit = 54; break; +		case 0x16: bit = 33; break; +		case 0x17: bit = 10; break; +		default: return; +		} +		/* +		 * Try to cache the base value so further operations can +		 * avoid RMW. If that faults, do not enable SSBD. +		 */ +		if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { +			setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); +			setup_force_cpu_cap(X86_FEATURE_SSBD); +			x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; +		} +	}  }  static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -791,6 +799,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c)  static void init_amd_zn(struct cpuinfo_x86 *c)  { +	set_cpu_cap(c, X86_FEATURE_ZEN);  	/*  	 * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects  	 * all up to and including B1. @@ -842,6 +851,7 @@ static void init_amd(struct cpuinfo_x86 *c)  	/* Multi core CPU? */  	if (c->extended_cpuid_level >= 0x80000008) {  		amd_detect_cmp(c); +		amd_get_topology(c);  		srat_detect_node(c);  	} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bfca937bdcc3..404df26b7de8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -12,8 +12,10 @@  #include <linux/utsname.h>  #include <linux/cpu.h>  #include <linux/module.h> +#include <linux/nospec.h> +#include <linux/prctl.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h>  #include <asm/cmdline.h>  #include <asm/bugs.h>  #include <asm/processor.h> @@ -25,8 +27,30 @@  #include <asm/pgtable.h>  #include <asm/set_memory.h>  #include <asm/intel-family.h> +#include <asm/hypervisor.h>  static void __init spectre_v2_select_mitigation(void); +static void __init ssb_select_mitigation(void); + +/* + * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any + * writes to SPEC_CTRL contain whatever reserved bits have been set. + */ +u64 __ro_after_init x86_spec_ctrl_base; +EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* + * The vendor and possibly platform specific bits which can be modified in + * x86_spec_ctrl_base. + */ +static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; + +/* + * AMD specific MSR info for Speculative Store Bypass control. 
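The bsp_init_amd() hunk above selects a family-specific bit in MSR_AMD64_LS_CFG for Speculative Store Bypass Disable (bit 54 on family 0x15, bit 33 on 0x16, bit 10 on 0x17) and caches it as the mask that the bugs.c hunk here exposes as x86_amd_ls_cfg_ssbd_mask. A minimal standalone sketch of just that selection; the helper name and the demo loop are illustrative, only the bit numbers come from the patch:

	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative only: mirrors the family -> LS_CFG SSBD bit mapping
	 * added to bsp_init_amd(); returns 0 for families the patch skips. */
	static uint64_t amd_ls_cfg_ssbd_mask(unsigned int family)
	{
		unsigned int bit;

		switch (family) {
		case 0x15: bit = 54; break;
		case 0x16: bit = 33; break;
		case 0x17: bit = 10; break;
		default:   return 0;
		}
		return 1ULL << bit;
	}

	int main(void)
	{
		unsigned int fam[] = { 0x15, 0x16, 0x17, 0x18 };

		for (unsigned int i = 0; i < 4; i++)
			printf("family 0x%x -> LS_CFG SSBD mask 0x%016llx\n", fam[i],
			       (unsigned long long)amd_ls_cfg_ssbd_mask(fam[i]));
		return 0;
	}

Caching the MSR base value at boot (rdmsrl_safe above) is what lets later toggling avoid a read-modify-write on every context switch.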
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). + */ +u64 __ro_after_init x86_amd_ls_cfg_base; +u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;  void __init check_bugs(void)  { @@ -37,9 +61,27 @@ void __init check_bugs(void)  		print_cpu_info(&boot_cpu_data);  	} +	/* +	 * Read the SPEC_CTRL MSR to account for reserved bits which may +	 * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD +	 * init code as it is not enumerated and depends on the family. +	 */ +	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) +		rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + +	/* Allow STIBP in MSR_SPEC_CTRL if supported */ +	if (boot_cpu_has(X86_FEATURE_STIBP)) +		x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; +  	/* Select the proper spectre mitigation before patching alternatives */  	spectre_v2_select_mitigation(); +	/* +	 * Select proper mitigation for any exposure to the Speculative Store +	 * Bypass vulnerability. +	 */ +	ssb_select_mitigation(); +  #ifdef CONFIG_X86_32  	/*  	 * Check whether we are able to run this kernel safely on SMP. @@ -93,7 +135,76 @@ static const char *spectre_v2_strings[] = {  #undef pr_fmt  #define pr_fmt(fmt)     "Spectre V2 : " fmt -static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = +	SPECTRE_V2_NONE; + +void +x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +{ +	u64 msrval, guestval, hostval = x86_spec_ctrl_base; +	struct thread_info *ti = current_thread_info(); + +	/* Is MSR_SPEC_CTRL implemented ? */ +	if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { +		/* +		 * Restrict guest_spec_ctrl to supported values. Clear the +		 * modifiable bits in the host base value and or the +		 * modifiable bits from the guest value. +		 */ +		guestval = hostval & ~x86_spec_ctrl_mask; +		guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; + +		/* SSBD controlled in MSR_SPEC_CTRL */ +		if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) +			hostval |= ssbd_tif_to_spec_ctrl(ti->flags); + +		if (hostval != guestval) { +			msrval = setguest ? guestval : hostval; +			wrmsrl(MSR_IA32_SPEC_CTRL, msrval); +		} +	} + +	/* +	 * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update +	 * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. +	 */ +	if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && +	    !static_cpu_has(X86_FEATURE_VIRT_SSBD)) +		return; + +	/* +	 * If the host has SSBD mitigation enabled, force it in the host's +	 * virtual MSR value. If its not permanently enabled, evaluate +	 * current's TIF_SSBD thread flag. +	 */ +	if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) +		hostval = SPEC_CTRL_SSBD; +	else +		hostval = ssbd_tif_to_spec_ctrl(ti->flags); + +	/* Sanitize the guest value */ +	guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; + +	if (hostval != guestval) { +		unsigned long tif; + +		tif = setguest ? 
ssbd_spec_ctrl_to_tif(guestval) : +				 ssbd_spec_ctrl_to_tif(hostval); + +		speculative_store_bypass_update(tif); +	} +} +EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); + +static void x86_amd_ssb_disable(void) +{ +	u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; + +	if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) +		wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); +	else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) +		wrmsrl(MSR_AMD64_LS_CFG, msrval); +}  #ifdef RETPOLINE  static bool spectre_v2_bad_module; @@ -312,32 +423,289 @@ retpoline_auto:  }  #undef pr_fmt +#define pr_fmt(fmt)	"Speculative Store Bypass: " fmt + +static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; + +/* The kernel command line selection */ +enum ssb_mitigation_cmd { +	SPEC_STORE_BYPASS_CMD_NONE, +	SPEC_STORE_BYPASS_CMD_AUTO, +	SPEC_STORE_BYPASS_CMD_ON, +	SPEC_STORE_BYPASS_CMD_PRCTL, +	SPEC_STORE_BYPASS_CMD_SECCOMP, +}; + +static const char *ssb_strings[] = { +	[SPEC_STORE_BYPASS_NONE]	= "Vulnerable", +	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled", +	[SPEC_STORE_BYPASS_PRCTL]	= "Mitigation: Speculative Store Bypass disabled via prctl", +	[SPEC_STORE_BYPASS_SECCOMP]	= "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", +}; + +static const struct { +	const char *option; +	enum ssb_mitigation_cmd cmd; +} ssb_mitigation_options[] = { +	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO },    /* Platform decides */ +	{ "on",		SPEC_STORE_BYPASS_CMD_ON },      /* Disable Speculative Store Bypass */ +	{ "off",	SPEC_STORE_BYPASS_CMD_NONE },    /* Don't touch Speculative Store Bypass */ +	{ "prctl",	SPEC_STORE_BYPASS_CMD_PRCTL },   /* Disable Speculative Store Bypass via prctl */ +	{ "seccomp",	SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ +}; + +static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +{ +	enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; +	char arg[20]; +	int ret, i; + +	if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { +		return SPEC_STORE_BYPASS_CMD_NONE; +	} else { +		ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", +					  arg, sizeof(arg)); +		if (ret < 0) +			return SPEC_STORE_BYPASS_CMD_AUTO; + +		for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { +			if (!match_option(arg, ret, ssb_mitigation_options[i].option)) +				continue; + +			cmd = ssb_mitigation_options[i].cmd; +			break; +		} + +		if (i >= ARRAY_SIZE(ssb_mitigation_options)) { +			pr_err("unknown option (%s). Switching to AUTO select\n", arg); +			return SPEC_STORE_BYPASS_CMD_AUTO; +		} +	} + +	return cmd; +} + +static enum ssb_mitigation __init __ssb_select_mitigation(void) +{ +	enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; +	enum ssb_mitigation_cmd cmd; + +	if (!boot_cpu_has(X86_FEATURE_SSBD)) +		return mode; + +	cmd = ssb_parse_cmdline(); +	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && +	    (cmd == SPEC_STORE_BYPASS_CMD_NONE || +	     cmd == SPEC_STORE_BYPASS_CMD_AUTO)) +		return mode; + +	switch (cmd) { +	case SPEC_STORE_BYPASS_CMD_AUTO: +	case SPEC_STORE_BYPASS_CMD_SECCOMP: +		/* +		 * Choose prctl+seccomp as the default mode if seccomp is +		 * enabled. 
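The x86_virt_spec_ctrl() hunk above restricts a guest's SPEC_CTRL value to the bits the host allows to differ: the host's fixed/reserved bits are kept, and only the modifiable bits (IBRS, plus STIBP/SSBD when supported) are taken from the guest. A small userspace sketch of just that bit merge; the function name and the example values are illustrative, the bit positions match the MSR_IA32_SPEC_CTRL layout:

	#include <stdint.h>
	#include <stdio.h>

	#define SPEC_CTRL_IBRS	(1ULL << 0)
	#define SPEC_CTRL_STIBP	(1ULL << 1)
	#define SPEC_CTRL_SSBD	(1ULL << 2)

	/* Keep host bits outside 'mask', take guest bits inside 'mask'. */
	static uint64_t merge_spec_ctrl(uint64_t hostval, uint64_t guestval,
					uint64_t mask)
	{
		return (hostval & ~mask) | (guestval & mask);
	}

	int main(void)
	{
		uint64_t host  = SPEC_CTRL_SSBD;	/* e.g. host base has SSBD set  */
		uint64_t guest = SPEC_CTRL_IBRS;	/* guest asks for IBRS only     */
		uint64_t mask  = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;

		printf("effective guest SPEC_CTRL = 0x%llx\n",
		       (unsigned long long)merge_spec_ctrl(host, guest, mask));
		return 0;
	}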
+		 */ +		if (IS_ENABLED(CONFIG_SECCOMP)) +			mode = SPEC_STORE_BYPASS_SECCOMP; +		else +			mode = SPEC_STORE_BYPASS_PRCTL; +		break; +	case SPEC_STORE_BYPASS_CMD_ON: +		mode = SPEC_STORE_BYPASS_DISABLE; +		break; +	case SPEC_STORE_BYPASS_CMD_PRCTL: +		mode = SPEC_STORE_BYPASS_PRCTL; +		break; +	case SPEC_STORE_BYPASS_CMD_NONE: +		break; +	} + +	/* +	 * We have three CPU feature flags that are in play here: +	 *  - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. +	 *  - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass +	 *  - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation +	 */ +	if (mode == SPEC_STORE_BYPASS_DISABLE) { +		setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); +		/* +		 * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may +		 * use a completely different MSR and bit dependent on family. +		 */ +		if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) +			x86_amd_ssb_disable(); +		else { +			x86_spec_ctrl_base |= SPEC_CTRL_SSBD; +			x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; +			wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); +		} +	} + +	return mode; +} + +static void ssb_select_mitigation(void) +{ +	ssb_mode = __ssb_select_mitigation(); + +	if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) +		pr_info("%s\n", ssb_strings[ssb_mode]); +} + +#undef pr_fmt +#define pr_fmt(fmt)     "Speculation prctl: " fmt + +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +{ +	bool update; + +	if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && +	    ssb_mode != SPEC_STORE_BYPASS_SECCOMP) +		return -ENXIO; + +	switch (ctrl) { +	case PR_SPEC_ENABLE: +		/* If speculation is force disabled, enable is not allowed */ +		if (task_spec_ssb_force_disable(task)) +			return -EPERM; +		task_clear_spec_ssb_disable(task); +		update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); +		break; +	case PR_SPEC_DISABLE: +		task_set_spec_ssb_disable(task); +		update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); +		break; +	case PR_SPEC_FORCE_DISABLE: +		task_set_spec_ssb_disable(task); +		task_set_spec_ssb_force_disable(task); +		update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); +		break; +	default: +		return -ERANGE; +	} + +	/* +	 * If being set on non-current task, delay setting the CPU +	 * mitigation until it is next scheduled. 
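ssb_prctl_set() above (and ssb_prctl_get()/arch_prctl_spec_ctrl_set() that follow) are reached from the generic PR_SET_SPECULATION_CTRL / PR_GET_SPECULATION_CTRL prctls added alongside this series. A minimal userspace usage sketch; the constants are guarded with fallback definitions in case the installed linux/prctl.h predates these patches:

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_SPECULATION_CTRL
	#define PR_GET_SPECULATION_CTRL	52
	#define PR_SET_SPECULATION_CTRL	53
	#define PR_SPEC_STORE_BYPASS	0
	#define PR_SPEC_PRCTL		(1UL << 0)
	#define PR_SPEC_ENABLE		(1UL << 1)
	#define PR_SPEC_DISABLE		(1UL << 2)
	#define PR_SPEC_FORCE_DISABLE	(1UL << 3)
	#endif

	int main(void)
	{
		int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);

		if (state < 0) {
			printf("speculation prctl not available: %s\n", strerror(errno));
			return 1;
		}
		printf("current SSB state: 0x%x\n", state);

		/* Opt this task out of Speculative Store Bypass (sets TIF_SSBD) */
		if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
			  PR_SPEC_DISABLE, 0, 0))
			printf("PR_SPEC_DISABLE failed: %s\n", strerror(errno));
		return 0;
	}

Note that the prctl only succeeds in the prctl/seccomp mitigation modes; in "on" or "off" mode ssb_prctl_set() returns -ENXIO, as the switch above shows.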
+	 */ +	if (task == current && update) +		speculative_store_bypass_update_current(); + +	return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, +			     unsigned long ctrl) +{ +	switch (which) { +	case PR_SPEC_STORE_BYPASS: +		return ssb_prctl_set(task, ctrl); +	default: +		return -ENODEV; +	} +} + +#ifdef CONFIG_SECCOMP +void arch_seccomp_spec_mitigate(struct task_struct *task) +{ +	if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) +		ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); +} +#endif + +static int ssb_prctl_get(struct task_struct *task) +{ +	switch (ssb_mode) { +	case SPEC_STORE_BYPASS_DISABLE: +		return PR_SPEC_DISABLE; +	case SPEC_STORE_BYPASS_SECCOMP: +	case SPEC_STORE_BYPASS_PRCTL: +		if (task_spec_ssb_force_disable(task)) +			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; +		if (task_spec_ssb_disable(task)) +			return PR_SPEC_PRCTL | PR_SPEC_DISABLE; +		return PR_SPEC_PRCTL | PR_SPEC_ENABLE; +	default: +		if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) +			return PR_SPEC_ENABLE; +		return PR_SPEC_NOT_AFFECTED; +	} +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ +	switch (which) { +	case PR_SPEC_STORE_BYPASS: +		return ssb_prctl_get(task); +	default: +		return -ENODEV; +	} +} + +void x86_spec_ctrl_setup_ap(void) +{ +	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) +		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + +	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) +		x86_amd_ssb_disable(); +}  #ifdef CONFIG_SYSFS -ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) + +static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, +			       char *buf, unsigned int bug)  { -	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +	if (!boot_cpu_has_bug(bug))  		return sprintf(buf, "Not affected\n"); -	if (boot_cpu_has(X86_FEATURE_PTI)) -		return sprintf(buf, "Mitigation: PTI\n"); + +	switch (bug) { +	case X86_BUG_CPU_MELTDOWN: +		if (boot_cpu_has(X86_FEATURE_PTI)) +			return sprintf(buf, "Mitigation: PTI\n"); + +		if (hypervisor_is_type(X86_HYPER_XEN_PV)) +			return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); + +		break; + +	case X86_BUG_SPECTRE_V1: +		return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + +	case X86_BUG_SPECTRE_V2: +		return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], +			       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", +			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? 
", IBRS_FW" : "", +			       spectre_v2_module_string()); + +	case X86_BUG_SPEC_STORE_BYPASS: +		return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + +	default: +		break; +	} +  	return sprintf(buf, "Vulnerable\n");  } +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ +	return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); +} +  ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)  { -	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) -		return sprintf(buf, "Not affected\n"); -	return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);  }  ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)  { -	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) -		return sprintf(buf, "Not affected\n"); +	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); +} -	return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], -		       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", -		       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", -		       spectre_v2_module_string()); +ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) +{ +	return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);  }  #endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 54d04d574148..0c5fcbd998cf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -20,6 +20,8 @@  #include <asm/amd_nb.h>  #include <asm/smp.h> +#include "cpu.h" +  #define LVL_1_INST	1  #define LVL_1_DATA	2  #define LVL_2		3 @@ -637,6 +639,45 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)  	return i;  } +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +{ +	/* +	 * We may have multiple LLCs if L3 caches exist, so check if we +	 * have an L3 cache by looking at the L3 cache CPUID leaf. +	 */ +	if (!cpuid_edx(0x80000006)) +		return; + +	if (c->x86 < 0x17) { +		/* LLC is at the node level. */ +		per_cpu(cpu_llc_id, cpu) = node_id; +	} else if (c->x86 == 0x17 && +		   c->x86_model >= 0 && c->x86_model <= 0x1F) { +		/* +		 * LLC is at the core complex level. +		 * Core complex ID is ApicId[3] for these processors. +		 */ +		per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; +	} else { +		/* +		 * LLC ID is calculated from the number of threads sharing the +		 * cache. +		 * */ +		u32 eax, ebx, ecx, edx, num_sharing_cache = 0; +		u32 llc_index = find_num_cache_leaves(c) - 1; + +		cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); +		if (eax) +			num_sharing_cache = ((eax >> 14) & 0xfff) + 1; + +		if (num_sharing_cache) { +			int bits = get_count_order(num_sharing_cache); + +			per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; +		} +	} +} +  void init_amd_cacheinfo(struct cpuinfo_x86 *c)  { @@ -650,7 +691,7 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c)  	}  } -unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) +void init_intel_cacheinfo(struct cpuinfo_x86 *c)  {  	/* Cache sizes */  	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; @@ -802,7 +843,8 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)  	c->x86_cache_size = l3 ? l3 : (l2 ? 
l2 : (l1i+l1d)); -	return l2; +	if (!l2) +		cpu_detect_cache_sizes(c);  }  static int __cache_amd_cpumap_setup(unsigned int cpu, int index, diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e5ec0f11c0de..14433ff5b828 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -18,6 +18,13 @@  #define RNG_ENABLED	(1 << 3)  #define RNG_ENABLE	(1 << 6)	/* MSR_VIA_RNG */ +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW	0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI		0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS	0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC	0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT		0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID		0x00000020 +  static void init_c3(struct cpuinfo_x86 *c)  {  	u32  lo, hi; @@ -112,6 +119,31 @@ static void early_init_centaur(struct cpuinfo_x86 *c)  	}  } +static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ +	u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + +	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); +	msr_ctl = vmx_msr_high | vmx_msr_low; + +	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) +		set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); +	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) +		set_cpu_cap(c, X86_FEATURE_VNMI); +	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { +		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, +		      vmx_msr_low, vmx_msr_high); +		msr_ctl2 = vmx_msr_high | vmx_msr_low; +		if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && +		    (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) +			set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); +		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) +			set_cpu_cap(c, X86_FEATURE_EPT); +		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) +			set_cpu_cap(c, X86_FEATURE_VPID); +	} +} +  static void init_centaur(struct cpuinfo_x86 *c)  {  #ifdef CONFIG_X86_32 @@ -128,6 +160,24 @@ static void init_centaur(struct cpuinfo_x86 *c)  	clear_cpu_cap(c, 0*32+31);  #endif  	early_init_centaur(c); +	init_intel_cacheinfo(c); +	detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 +	detect_ht(c); +#endif + +	if (c->cpuid_level > 9) { +		unsigned int eax = cpuid_eax(10); + +		/* +		 * Check for version and the number of counters +		 * Version(eax[7:0]) can't be 0; +		 * Counters(eax[15:8]) should be greater than 1; +		 */ +		if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) +			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); +	} +  	switch (c->x86) {  #ifdef CONFIG_X86_32  	case 5: @@ -199,6 +249,9 @@ static void init_centaur(struct cpuinfo_x86 *c)  #ifdef CONFIG_X86_64  	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);  #endif + +	if (cpu_has(c, X86_FEATURE_VMX)) +		centaur_detect_vmx_virtcap(c);  }  #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8a5b185735e1..eb4cb3efd20e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,3 +1,6 @@ +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 +  #include <linux/bootmem.h>  #include <linux/linkage.h>  #include <linux/bitops.h> @@ -66,6 +69,13 @@ cpumask_var_t cpu_callin_mask;  /* representing cpus for which sibling maps can be computed */  cpumask_var_t cpu_sibling_setup_mask; +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +EXPORT_SYMBOL(smp_num_siblings); + +/* Last level cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; +  /* correctly size the local cpu masks */  void __init 
setup_cpu_local_masks(void)  { @@ -577,6 +587,19 @@ static void get_model_name(struct cpuinfo_x86 *c)  	*(s + 1) = '\0';  } +void detect_num_cpu_cores(struct cpuinfo_x86 *c) +{ +	unsigned int eax, ebx, ecx, edx; + +	c->x86_max_cores = 1; +	if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) +		return; + +	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); +	if (eax & 0x1f) +		c->x86_max_cores = (eax >> 26) + 1; +} +  void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)  {  	unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -757,17 +780,38 @@ static void init_speculation_control(struct cpuinfo_x86 *c)  	 * and they also have a different bit for STIBP support. Also,  	 * a hypervisor might have set the individual AMD bits even on  	 * Intel CPUs, for finer-grained selection of what's available. -	 * -	 * We use the AMD bits in 0x8000_0008 EBX as the generic hardware -	 * features, which are visible in /proc/cpuinfo and used by the -	 * kernel. So set those accordingly from the Intel bits.  	 */  	if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {  		set_cpu_cap(c, X86_FEATURE_IBRS);  		set_cpu_cap(c, X86_FEATURE_IBPB); +		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);  	} +  	if (cpu_has(c, X86_FEATURE_INTEL_STIBP))  		set_cpu_cap(c, X86_FEATURE_STIBP); + +	if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) || +	    cpu_has(c, X86_FEATURE_VIRT_SSBD)) +		set_cpu_cap(c, X86_FEATURE_SSBD); + +	if (cpu_has(c, X86_FEATURE_AMD_IBRS)) { +		set_cpu_cap(c, X86_FEATURE_IBRS); +		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); +	} + +	if (cpu_has(c, X86_FEATURE_AMD_IBPB)) +		set_cpu_cap(c, X86_FEATURE_IBPB); + +	if (cpu_has(c, X86_FEATURE_AMD_STIBP)) { +		set_cpu_cap(c, X86_FEATURE_STIBP); +		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); +	} + +	if (cpu_has(c, X86_FEATURE_AMD_SSBD)) { +		set_cpu_cap(c, X86_FEATURE_SSBD); +		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); +		clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD); +	}  }  void get_cpu_cap(struct cpuinfo_x86 *c) @@ -848,6 +892,11 @@ void get_cpu_cap(struct cpuinfo_x86 *c)  		c->x86_power = edx;  	} +	if (c->extended_cpuid_level >= 0x80000008) { +		cpuid(0x80000008, &eax, &ebx, &ecx, &edx); +		c->x86_capability[CPUID_8000_0008_EBX] = ebx; +	} +  	if (c->extended_cpuid_level >= 0x8000000a)  		c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); @@ -871,7 +920,6 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c)  		c->x86_virt_bits = (eax >> 8) & 0xff;  		c->x86_phys_bits = eax & 0xff; -		c->x86_capability[CPUID_8000_0008_EBX] = ebx;  	}  #ifdef CONFIG_X86_32  	else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) @@ -923,21 +971,48 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {  	{}  }; -static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) +/* Only list CPUs which speculate but are non susceptible to SSB */ +static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT1	}, +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		}, +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT2	}, +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MERRIFIELD	}, +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_CORE_YONAH		}, +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		}, +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		}, +	{ X86_VENDOR_AMD,	0x12,					}, +	{ X86_VENDOR_AMD,	0x11,					}, +	{ X86_VENDOR_AMD,	0x10,					}, +	{ X86_VENDOR_AMD,	0xf,					}, +	{} +}; + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)  {  	u64 ia32_cap = 0; -	if (x86_match_cpu(cpu_no_meltdown)) -		return false; +	
if (x86_match_cpu(cpu_no_speculation)) +		return; + +	setup_force_cpu_bug(X86_BUG_SPECTRE_V1); +	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);  	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))  		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); +	if (!x86_match_cpu(cpu_no_spec_store_bypass) && +	   !(ia32_cap & ARCH_CAP_SSB_NO) && +	   !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) +		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + +	if (x86_match_cpu(cpu_no_meltdown)) +		return; +  	/* Rogue Data Cache Load? No! */  	if (ia32_cap & ARCH_CAP_RDCL_NO) -		return false; +		return; -	return true; +	setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);  }  /* @@ -988,12 +1063,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)  	setup_force_cpu_cap(X86_FEATURE_ALWAYS); -	if (!x86_match_cpu(cpu_no_speculation)) { -		if (cpu_vulnerable_to_meltdown(c)) -			setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); -		setup_force_cpu_bug(X86_BUG_SPECTRE_V1); -		setup_force_cpu_bug(X86_BUG_SPECTRE_V2); -	} +	cpu_set_bug_bits(c);  	fpu__init_system(c); @@ -1004,6 +1074,21 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)  	 */  	setup_clear_cpu_cap(X86_FEATURE_PCID);  #endif + +	/* +	 * Later in the boot process pgtable_l5_enabled() relies on +	 * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not +	 * enabled by this point we need to clear the feature bit to avoid +	 * false-positives at the later stage. +	 * +	 * pgtable_l5_enabled() can be false here for several reasons: +	 *  - 5-level paging is disabled compile-time; +	 *  - it's 32-bit kernel; +	 *  - machine doesn't support 5-level paging; +	 *  - user specified 'no5lvl' in kernel command line. +	 */ +	if (!pgtable_l5_enabled()) +		setup_clear_cpu_cap(X86_FEATURE_LA57);  }  void __init early_cpu_init(void) @@ -1355,6 +1440,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)  #endif  	mtrr_ap_init();  	validate_apic_and_package_id(c); +	x86_spec_ctrl_setup_ap();  }  static __init int setup_noclflush(char *arg) @@ -1516,7 +1602,7 @@ DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =  	(unsigned long)&init_thread_union + THREAD_SIZE;  EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR  DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);  #endif diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index e806b11a99af..38216f678fc3 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -47,7 +47,19 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],  extern void get_cpu_cap(struct cpuinfo_x86 *c);  extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern u32 get_scattered_cpuid_leaf(unsigned int level, +				    unsigned int sub_leaf, +				    enum cpuid_regs_idx reg); +extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); + +extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); +extern int detect_extended_topology(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c);  unsigned int aperfmperf_get_khz(int cpu); +extern void x86_spec_ctrl_setup_ap(void); +  #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 60d1897041da..eb75564f2d25 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -188,7 +188,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)  		setup_clear_cpu_cap(X86_FEATURE_IBPB);  		
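The X86_BUG_* bits forced in cpu_set_bug_bits() above feed the cpu_show_common() sysfs output added in the bugs.c hunk earlier in this diff. A small reader for the resulting files, assuming the standard sysfs location; the files only exist on kernels carrying these patches:

	#include <stdio.h>

	int main(void)
	{
		static const char *files[] = {
			"meltdown", "spectre_v1", "spectre_v2", "spec_store_bypass",
		};
		char path[128], line[256];

		for (unsigned int i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
			snprintf(path, sizeof(path),
				 "/sys/devices/system/cpu/vulnerabilities/%s", files[i]);
			FILE *f = fopen(path, "r");

			if (!f) {
				printf("%-18s: <not present>\n", files[i]);
				continue;
			}
			if (fgets(line, sizeof(line), f))
				printf("%-18s: %s", files[i], line); /* keeps its '\n' */
			fclose(f);
		}
		return 0;
	}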
setup_clear_cpu_cap(X86_FEATURE_STIBP);  		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); +		setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);  		setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); +		setup_clear_cpu_cap(X86_FEATURE_SSBD); +		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);  	}  	/* @@ -453,24 +456,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)  #endif  } -/* - * find out the number of processor cores on the die - */ -static int intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ -	unsigned int eax, ebx, ecx, edx; - -	if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) -		return 1; - -	/* Intel has a non-standard dependency on %ecx for this CPUID level. */ -	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); -	if (eax & 0x1f) -		return (eax >> 26) + 1; -	else -		return 1; -} -  static void detect_vmx_virtcap(struct cpuinfo_x86 *c)  {  	/* Intel VMX MSR indicated features */ @@ -653,8 +638,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)  static void init_intel(struct cpuinfo_x86 *c)  { -	unsigned int l2 = 0; -  	early_init_intel(c);  	intel_workarounds(c); @@ -671,19 +654,13 @@ static void init_intel(struct cpuinfo_x86 *c)  		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology  		 * detection.  		 */ -		c->x86_max_cores = intel_num_cpu_cores(c); +		detect_num_cpu_cores(c);  #ifdef CONFIG_X86_32  		detect_ht(c);  #endif  	} -	l2 = init_intel_cacheinfo(c); - -	/* Detect legacy cache sizes if init_intel_cacheinfo did not */ -	if (l2 == 0) { -		cpu_detect_cache_sizes(c); -		l2 = c->x86_cache_size; -	} +	init_intel_cacheinfo(c);  	if (c->cpuid_level > 9) {  		unsigned eax = cpuid_eax(10); @@ -696,7 +673,8 @@ static void init_intel(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);  	if (boot_cpu_has(X86_FEATURE_DS)) { -		unsigned int l1; +		unsigned int l1, l2; +  		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);  		if (!(l1 & (1<<11)))  			set_cpu_cap(c, X86_FEATURE_BTS); @@ -724,6 +702,7 @@ static void init_intel(struct cpuinfo_x86 *c)  	 * Dixon is NOT a Celeron.  	 */  	if (c->x86 == 6) { +		unsigned int l2 = c->x86_cache_size;  		char *p = NULL;  		switch (c->x86_model) { diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 589b948e6e01..ec4754f81cbd 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -33,8 +33,8 @@  #include <asm/intel_rdt_sched.h>  #include "intel_rdt.h" -#define MAX_MBA_BW	100u  #define MBA_IS_LINEAR	0x4 +#define MBA_MAX_MBPS	U32_MAX  /* Mutex to protect rdtgroup access. */  DEFINE_MUTEX(rdtgroup_mutex); @@ -178,7 +178,7 @@ struct rdt_resource rdt_resources_all[] = {  		.msr_update		= mba_wrmsr,  		.cache_level		= 3,  		.parse_ctrlval		= parse_bw, -		.format_str		= "%d=%*d", +		.format_str		= "%d=%*u",  		.fflags			= RFTYPE_RES_MB,  	},  }; @@ -230,6 +230,14 @@ static inline void cache_alloc_hsw_probe(void)  	rdt_alloc_capable = true;  } +bool is_mba_sc(struct rdt_resource *r) +{ +	if (!r) +		return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc; + +	return r->membw.mba_sc; +} +  /*   * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values   * exposed to user interface and the h/w understandable delay values. @@ -341,7 +349,7 @@ static int get_cache_id(int cpu, int level)   * that can be written to QOS_MSRs.   * There are currently no SKUs which support non linear delay values.   
*/ -static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)  {  	if (r->membw.delay_linear)  		return MAX_MBA_BW - bw; @@ -431,25 +439,40 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,  	return NULL;  } +void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) +{ +	int i; + +	/* +	 * Initialize the Control MSRs to having no control. +	 * For Cache Allocation: Set all bits in cbm +	 * For Memory Allocation: Set b/w requested to 100% +	 * and the bandwidth in MBps to U32_MAX +	 */ +	for (i = 0; i < r->num_closid; i++, dc++, dm++) { +		*dc = r->default_ctrl; +		*dm = MBA_MAX_MBPS; +	} +} +  static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)  {  	struct msr_param m; -	u32 *dc; -	int i; +	u32 *dc, *dm;  	dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);  	if (!dc)  		return -ENOMEM; -	d->ctrl_val = dc; +	dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); +	if (!dm) { +		kfree(dc); +		return -ENOMEM; +	} -	/* -	 * Initialize the Control MSRs to having no control. -	 * For Cache Allocation: Set all bits in cbm -	 * For Memory Allocation: Set b/w requested to 100 -	 */ -	for (i = 0; i < r->num_closid; i++, dc++) -		*dc = r->default_ctrl; +	d->ctrl_val = dc; +	d->mbps_val = dm; +	setup_default_ctrlval(r, dc, dm);  	m.low = 0;  	m.high = r->num_closid; @@ -588,6 +611,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)  		}  		kfree(d->ctrl_val); +		kfree(d->mbps_val);  		kfree(d->rmid_busy_llc);  		kfree(d->mbm_total);  		kfree(d->mbm_local); @@ -821,6 +845,8 @@ static __init void rdt_quirks(void)  	case INTEL_FAM6_SKYLAKE_X:  		if (boot_cpu_data.x86_stepping <= 4)  			set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); +		else +			set_rdt_options("!l3cat");  	}  } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 3fd7a70ee04a..39752825e376 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -28,6 +28,7 @@  #define MBM_CNTR_WIDTH			24  #define MBM_OVERFLOW_INTERVAL		1000 +#define MAX_MBA_BW			100u  #define RMID_VAL_ERROR			BIT_ULL(63)  #define RMID_VAL_UNAVAIL		BIT_ULL(62) @@ -180,10 +181,20 @@ struct rftype {   * struct mbm_state - status for each MBM counter in each domain   * @chunks:	Total data moved (multiply by rdt_group.mon_scale to get bytes)   * @prev_msr	Value of IA32_QM_CTR for this RMID last time we read it + * @chunks_bw	Total local data moved. 
Used for bandwidth calculation + * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting + * @prev_bw	The most recent bandwidth in MBps + * @delta_bw	Difference between the current and previous bandwidth + * @delta_comp	Indicates whether to compute the delta_bw   */  struct mbm_state {  	u64	chunks;  	u64	prev_msr; +	u64	chunks_bw; +	u64	prev_bw_msr; +	u32	prev_bw; +	u32	delta_bw; +	bool	delta_comp;  };  /** @@ -202,6 +213,7 @@ struct mbm_state {   * @cqm_work_cpu:   *		worker cpu for CQM h/w counters   * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID) + * @mbps_val:	When mba_sc is enabled, this holds the bandwidth in MBps   * @new_ctrl:	new ctrl value to be loaded   * @have_new_ctrl: did user provide new_ctrl for this domain   */ @@ -217,6 +229,7 @@ struct rdt_domain {  	int			mbm_work_cpu;  	int			cqm_work_cpu;  	u32			*ctrl_val; +	u32			*mbps_val;  	u32			new_ctrl;  	bool			have_new_ctrl;  }; @@ -259,6 +272,7 @@ struct rdt_cache {   * @min_bw:		Minimum memory bandwidth percentage user can request   * @bw_gran:		Granularity at which the memory bandwidth is allocated   * @delay_linear:	True if memory B/W delay is in linear scale + * @mba_sc:		True if MBA software controller(mba_sc) is enabled   * @mb_map:		Mapping of memory B/W percentage to memory B/W delay   */  struct rdt_membw { @@ -266,6 +280,7 @@ struct rdt_membw {  	u32		min_bw;  	u32		bw_gran;  	u32		delay_linear; +	bool		mba_sc;  	u32		*mb_map;  }; @@ -445,6 +460,9 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,  void mbm_setup_overflow_handler(struct rdt_domain *dom,  				unsigned long delay_ms);  void mbm_handle_overflow(struct work_struct *work); +bool is_mba_sc(struct rdt_resource *r); +void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); +u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);  void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);  void cqm_handle_limbo(struct work_struct *work);  bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 23e1d5c249c6..116d57b248d3 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -53,7 +53,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)  		return false;  	} -	if (bw < r->membw.min_bw || bw > r->default_ctrl) { +	if ((bw < r->membw.min_bw || bw > r->default_ctrl) && +	    !is_mba_sc(r)) {  		rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,  				    r->membw.min_bw, r->default_ctrl);  		return false; @@ -179,6 +180,8 @@ static int update_domains(struct rdt_resource *r, int closid)  	struct msr_param msr_param;  	cpumask_var_t cpu_mask;  	struct rdt_domain *d; +	bool mba_sc; +	u32 *dc;  	int cpu;  	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) @@ -188,13 +191,20 @@ static int update_domains(struct rdt_resource *r, int closid)  	msr_param.high = msr_param.low + 1;  	msr_param.res = r; +	mba_sc = is_mba_sc(r);  	list_for_each_entry(d, &r->domains, list) { -		if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { +		dc = !mba_sc ? 
d->ctrl_val : d->mbps_val; +		if (d->have_new_ctrl && d->new_ctrl != dc[closid]) {  			cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); -			d->ctrl_val[closid] = d->new_ctrl; +			dc[closid] = d->new_ctrl;  		}  	} -	if (cpumask_empty(cpu_mask)) + +	/* +	 * Avoid writing the control msr with control values when +	 * MBA software controller is enabled +	 */ +	if (cpumask_empty(cpu_mask) || mba_sc)  		goto done;  	cpu = get_cpu();  	/* Update CBM on this cpu if it's in cpu_mask. */ @@ -282,13 +292,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)  {  	struct rdt_domain *dom;  	bool sep = false; +	u32 ctrl_val;  	seq_printf(s, "%*s:", max_name_width, r->name);  	list_for_each_entry(dom, &r->domains, list) {  		if (sep)  			seq_puts(s, ";"); + +		ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] : +			    dom->mbps_val[closid]);  		seq_printf(s, r->format_str, dom->id, max_data_width, -			   dom->ctrl_val[closid]); +			   ctrl_val);  		sep = true;  	}  	seq_puts(s, "\n"); diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 681450eee428..b0f3aed76b75 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -225,10 +225,18 @@ void free_rmid(u32 rmid)  		list_add_tail(&entry->list, &rmid_free_lru);  } +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +{ +	u64 shift = 64 - MBM_CNTR_WIDTH, chunks; + +	chunks = (cur_msr << shift) - (prev_msr << shift); +	return chunks >>= shift; +} +  static int __mon_event_count(u32 rmid, struct rmid_read *rr)  { -	u64 chunks, shift, tval;  	struct mbm_state *m; +	u64 chunks, tval;  	tval = __rmid_read(rmid, rr->evtid);  	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { @@ -254,14 +262,12 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)  	}  	if (rr->first) { -		m->prev_msr = tval; -		m->chunks = 0; +		memset(m, 0, sizeof(struct mbm_state)); +		m->prev_bw_msr = m->prev_msr = tval;  		return 0;  	} -	shift = 64 - MBM_CNTR_WIDTH; -	chunks = (tval << shift) - (m->prev_msr << shift); -	chunks >>= shift; +	chunks = mbm_overflow_count(m->prev_msr, tval);  	m->chunks += chunks;  	m->prev_msr = tval; @@ -270,6 +276,32 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)  }  /* + * Supporting function to calculate the memory bandwidth + * and delta bandwidth in MBps. + */ +static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +{ +	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; +	struct mbm_state *m = &rr->d->mbm_local[rmid]; +	u64 tval, cur_bw, chunks; + +	tval = __rmid_read(rmid, rr->evtid); +	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) +		return; + +	chunks = mbm_overflow_count(m->prev_bw_msr, tval); +	m->chunks_bw += chunks; +	m->chunks = m->chunks_bw; +	cur_bw = (chunks * r->mon_scale) >> 20; + +	if (m->delta_comp) +		m->delta_bw = abs(cur_bw - m->prev_bw); +	m->delta_comp = false; +	m->prev_bw = cur_bw; +	m->prev_bw_msr = tval; +} + +/*   * This is called via IPI to read the CQM/MBM counters   * on a domain.   
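mbm_overflow_count() above computes the delta of the 24-bit free-running MBM counter without an explicit wraparound branch: shifting both samples into the top bits makes the unsigned subtraction wrap naturally, and shifting back truncates the result to the counter width. A standalone sketch of the same arithmetic with a contrived wrap case (MBM_CNTR_WIDTH comes from intel_rdt.h above):

	#include <stdint.h>
	#include <stdio.h>

	#define MBM_CNTR_WIDTH	24

	/* Same shift trick as mbm_overflow_count(): correct even when the
	 * hardware counter wrapped between the two reads. */
	static uint64_t mbm_delta(uint64_t prev_msr, uint64_t cur_msr)
	{
		uint64_t shift = 64 - MBM_CNTR_WIDTH, chunks;

		chunks = (cur_msr << shift) - (prev_msr << shift);
		return chunks >> shift;
	}

	int main(void)
	{
		uint64_t prev = 0xfffff0;	/* near the top of the 24-bit range */
		uint64_t cur  = 0x000010;	/* counter wrapped around            */

		printf("delta = %llu chunks\n",
		       (unsigned long long)mbm_delta(prev, cur)); /* prints 32 */
		return 0;
	}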
*/ @@ -297,6 +329,118 @@ void mon_event_count(void *info)  	}  } +/* + * Feedback loop for MBA software controller (mba_sc) + * + * mba_sc is a feedback loop where we periodically read MBM counters and + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so + * that: + * + *   current bandwdith(cur_bw) < user specified bandwidth(user_bw) + * + * This uses the MBM counters to measure the bandwidth and MBA throttle + * MSRs to control the bandwidth for a particular rdtgrp. It builds on the + * fact that resctrl rdtgroups have both monitoring and control. + * + * The frequency of the checks is 1s and we just tag along the MBM overflow + * timer. Having 1s interval makes the calculation of bandwidth simpler. + * + * Although MBA's goal is to restrict the bandwidth to a maximum, there may + * be a need to increase the bandwidth to avoid uncecessarily restricting + * the L2 <-> L3 traffic. + * + * Since MBA controls the L2 external bandwidth where as MBM measures the + * L3 external bandwidth the following sequence could lead to such a + * situation. + * + * Consider an rdtgroup which had high L3 <-> memory traffic in initial + * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but + * after some time rdtgroup has mostly L2 <-> L3 traffic. + * + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its + * throttle MSRs already have low percentage values.  To avoid + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. + */ +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) +{ +	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; +	struct mbm_state *pmbm_data, *cmbm_data; +	u32 cur_bw, delta_bw, user_bw; +	struct rdt_resource *r_mba; +	struct rdt_domain *dom_mba; +	struct list_head *head; +	struct rdtgroup *entry; + +	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; +	closid = rgrp->closid; +	rmid = rgrp->mon.rmid; +	pmbm_data = &dom_mbm->mbm_local[rmid]; + +	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); +	if (!dom_mba) { +		pr_warn_once("Failure to get domain for MBA update\n"); +		return; +	} + +	cur_bw = pmbm_data->prev_bw; +	user_bw = dom_mba->mbps_val[closid]; +	delta_bw = pmbm_data->delta_bw; +	cur_msr_val = dom_mba->ctrl_val[closid]; + +	/* +	 * For Ctrl groups read data from child monitor groups. +	 */ +	head = &rgrp->mon.crdtgrp_list; +	list_for_each_entry(entry, head, mon.crdtgrp_list) { +		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; +		cur_bw += cmbm_data->prev_bw; +		delta_bw += cmbm_data->delta_bw; +	} + +	/* +	 * Scale up/down the bandwidth linearly for the ctrl group.  The +	 * bandwidth step is the bandwidth granularity specified by the +	 * hardware. +	 * +	 * The delta_bw is used when increasing the bandwidth so that we +	 * dont alternately increase and decrease the control values +	 * continuously. +	 * +	 * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if +	 * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep +	 * switching between 90 and 110 continuously if we only check +	 * cur_bw < user_bw. 
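The update_mba_bw() comment above explains why the throttle value is only stepped up when the user target exceeds the measurement by more than delta_bw; the conditional that follows implements that hysteresis. A toy model of just that per-second decision; the MIN_BW/BW_GRAN values are placeholders (the real ones come from the rdt_membw parameters), only the comparison structure mirrors the patch:

	#include <stdio.h>

	#define MIN_BW		10	/* stand-in for membw.min_bw  */
	#define BW_GRAN		10	/* stand-in for membw.bw_gran */
	#define MAX_MBA_BW	100

	/* Returns the new throttle percentage for one evaluation period. */
	static unsigned int mba_sc_step(unsigned int cur_msr_val,
					unsigned int cur_bw, unsigned int user_bw,
					unsigned int delta_bw)
	{
		if (cur_msr_val > MIN_BW && user_bw < cur_bw)
			return cur_msr_val - BW_GRAN;	/* throttle harder        */
		if (cur_msr_val < MAX_MBA_BW && user_bw > cur_bw + delta_bw)
			return cur_msr_val + BW_GRAN;	/* give bandwidth back    */
		return cur_msr_val;			/* inside hysteresis band */
	}

	int main(void)
	{
		/* measured above target -> step down */
		printf("%u\n", mba_sc_step(90, 110, 100, 20));	/* 80 */
		/* target above measured, but within delta_bw -> hold */
		printf("%u\n", mba_sc_step(80, 95, 100, 20));	/* 80 */
		/* target above measured by more than delta_bw -> step up */
		printf("%u\n", mba_sc_step(80, 70, 100, 20));	/* 90 */
		return 0;
	}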
+	 */ +	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { +		new_msr_val = cur_msr_val - r_mba->membw.bw_gran; +	} else if (cur_msr_val < MAX_MBA_BW && +		   (user_bw > (cur_bw + delta_bw))) { +		new_msr_val = cur_msr_val + r_mba->membw.bw_gran; +	} else { +		return; +	} + +	cur_msr = r_mba->msr_base + closid; +	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); +	dom_mba->ctrl_val[closid] = new_msr_val; + +	/* +	 * Delta values are updated dynamically package wise for each +	 * rdtgrp everytime the throttle MSR changes value. +	 * +	 * This is because (1)the increase in bandwidth is not perfectly +	 * linear and only "approximately" linear even when the hardware +	 * says it is linear.(2)Also since MBA is a core specific +	 * mechanism, the delta values vary based on number of cores used +	 * by the rdtgrp. +	 */ +	pmbm_data->delta_comp = true; +	list_for_each_entry(entry, head, mon.crdtgrp_list) { +		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; +		cmbm_data->delta_comp = true; +	} +} +  static void mbm_update(struct rdt_domain *d, int rmid)  {  	struct rmid_read rr; @@ -314,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)  	}  	if (is_mbm_local_enabled()) {  		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; -		__mon_event_count(rmid, &rr); + +		/* +		 * Call the MBA software controller only for the +		 * control groups and when user has enabled +		 * the software controller explicitly. +		 */ +		if (!is_mba_sc(NULL)) +			__mon_event_count(rmid, &rr); +		else +			mbm_bw_count(rmid, &rr);  	}  } @@ -385,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)  		head = &prgrp->mon.crdtgrp_list;  		list_for_each_entry(crgrp, head, mon.crdtgrp_list)  			mbm_update(d, crgrp->mon.rmid); + +		if (is_mba_sc(NULL)) +			update_mba_bw(prgrp, d);  	}  	schedule_delayed_work_on(cpu, &d->mbm_over, delay); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index fca759d272a1..749856a2e736 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1005,6 +1005,11 @@ static void l2_qos_cfg_update(void *arg)  	wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);  } +static inline bool is_mba_linear(void) +{ +	return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear; +} +  static int set_cache_qos_cfg(int level, bool enable)  {  	void (*update)(void *arg); @@ -1041,6 +1046,28 @@ static int set_cache_qos_cfg(int level, bool enable)  	return 0;  } +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + * MBA software controller is supported only if + * MBM is supported and MBA is in linear scale. 
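/*
 * Usage sketch (not from this patch): the controller is switched on from
 * user space by mounting resctrl with the mba_MBps option added further
 * down in this patch, after which the MB: values in schemata are treated
 * as MBps targets rather than throttle percentages.  set_mba_sc() below
 * rejects the request unless MBM is enabled and MBA is linear.  Minimal
 * host-side example; error handling trimmed, needs CAP_SYS_ADMIN.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, "mba_MBps")) {
                perror("mount");
                return 1;
        }
        puts("MBA software controller enabled");
        return 0;
}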
+ */ +static int set_mba_sc(bool mba_sc) +{ +	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA]; +	struct rdt_domain *d; + +	if (!is_mbm_enabled() || !is_mba_linear() || +	    mba_sc == is_mba_sc(r)) +		return -EINVAL; + +	r->membw.mba_sc = mba_sc; +	list_for_each_entry(d, &r->domains, list) +		setup_default_ctrlval(r, d->ctrl_val, d->mbps_val); + +	return 0; +} +  static int cdp_enable(int level, int data_type, int code_type)  {  	struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; @@ -1123,6 +1150,10 @@ static int parse_rdtgroupfs_options(char *data)  			ret = cdpl2_enable();  			if (ret)  				goto out; +		} else if (!strcmp(token, "mba_MBps")) { +			ret = set_mba_sc(true); +			if (ret) +				goto out;  		} else {  			ret = -EINVAL;  			goto out; @@ -1445,6 +1476,8 @@ static void rdt_kill_sb(struct super_block *sb)  	cpus_read_lock();  	mutex_lock(&rdtgroup_mutex); +	set_mba_sc(false); +  	/*Put everything back to default values. */  	for_each_alloc_enabled_rdt_resource(r)  		reset_all_ctrls(r); diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 475cb4f5f14f..c805a06e14c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -48,7 +48,7 @@ static struct dentry *dfs_inj;  static u8 n_banks; -#define MAX_FLAG_OPT_SIZE	3 +#define MAX_FLAG_OPT_SIZE	4  #define NBCFG			0x44  enum injection_type { diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 5bbd06f38ff6..f34d89c01edc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -160,6 +160,11 @@ static struct severity {  		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),  		USER  		), +	MCESEV( +		PANIC, "Data load in unrecoverable area of kernel", +		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), +		KERNEL +		),  #endif  	MCESEV(  		PANIC, "Action required: unknown MCACOD", diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 42cf2880d0ed..c102ad51025e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll);  static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,  			  struct pt_regs *regs)  { -	int i, ret = 0;  	char *tmp; +	int i;  	for (i = 0; i < mca_cfg.banks; i++) {  		m->status = mce_rdmsrl(msr_ops.status(i)); -		if (m->status & MCI_STATUS_VAL) { -			__set_bit(i, validp); -			if (quirk_no_way_out) -				quirk_no_way_out(i, m, regs); -		} +		if (!(m->status & MCI_STATUS_VAL)) +			continue; + +		__set_bit(i, validp); +		if (quirk_no_way_out) +			quirk_no_way_out(i, m, regs);  		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { +			mce_read_aux(m, i);  			*msg = tmp; -			ret = 1; +			return 1;  		}  	} -	return ret; +	return 0;  }  /* @@ -1205,13 +1207,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)  		lmce = m.mcgstatus & MCG_STATUS_LMCES;  	/* +	 * Local machine check may already know that we have to panic. +	 * Broadcast machine check begins rendezvous in mce_start()  	 * Go through all banks in exclusion of the other CPUs. This way we  	 * don't report duplicated events on shared banks because the first one -	 * to see it will clear it. If this is a Local MCE, then no need to -	 * perform rendezvous. +	 * to see it will clear it.  	 
*/ -	if (!lmce) +	if (lmce) { +		if (no_way_out) +			mce_panic("Fatal local machine check", &m, msg); +	} else {  		order = mce_start(&no_way_out); +	}  	for (i = 0; i < cfg->banks; i++) {  		__clear_bit(i, toclear); @@ -1287,12 +1294,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)  			no_way_out = worst >= MCE_PANIC_SEVERITY;  	} else {  		/* -		 * Local MCE skipped calling mce_reign() -		 * If we found a fatal error, we need to panic here. +		 * If there was a fatal machine check we should have +		 * already called mce_panic earlier in this function. +		 * Since we re-read the banks, we might have found +		 * something new. Check again to see if we found a +		 * fatal error. We call "mce_severity()" again to +		 * make sure we have the right "msg".  		 */ -		 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) -			mce_panic("Machine check from unknown source", -				NULL, NULL); +		if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { +			mce_severity(&m, cfg->tolerant, &msg, true); +			mce_panic("Local fatal machine check!", &m, msg); +		}  	}  	/* @@ -1457,7 +1469,7 @@ static int __mcheck_cpu_mce_banks_init(void)  	int i;  	u8 num_banks = mca_cfg.banks; -	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); +	mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL);  	if (!mce_banks)  		return -ENOMEM; @@ -1727,6 +1739,21 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)  	}  } +static void mce_centaur_feature_init(struct cpuinfo_x86 *c) +{ +	struct mca_config *cfg = &mca_cfg; + +	 /* +	  * All newer Centaur CPUs support MCE broadcasting. Enable +	  * synchronization with a one second timeout. +	  */ +	if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || +	     c->x86 > 6) { +		if (cfg->monarch_timeout < 0) +			cfg->monarch_timeout = USEC_PER_SEC; +	} +} +  static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)  {  	switch (c->x86_vendor) { @@ -1739,6 +1766,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)  		mce_amd_feature_init(c);  		break;  		} +	case X86_VENDOR_CENTAUR: +		mce_centaur_feature_init(c); +		break;  	default:  		break; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f7666eef4a87..dd33c357548f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -94,6 +94,11 @@ static struct smca_bank_name smca_names[] = {  	[SMCA_SMU]	= { "smu",		"System Management Unit" },  }; +static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = +{ +	[0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } +}; +  const char *smca_get_name(enum smca_bank_types t)  {  	if (t >= N_SMCA_BANK_TYPES) @@ -431,8 +436,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)  	wrmsr(MSR_CU_DEF_ERR, low, high);  } -static u32 smca_get_block_address(unsigned int cpu, unsigned int bank, -				  unsigned int block) +static u32 smca_get_block_address(unsigned int bank, unsigned int block)  {  	u32 low, high;  	u32 addr = 0; @@ -443,24 +447,30 @@ static u32 smca_get_block_address(unsigned int cpu, unsigned int bank,  	if (!block)  		return MSR_AMD64_SMCA_MCx_MISC(bank); +	/* Check our cache first: */ +	if (smca_bank_addrs[bank][block] != -1) +		return smca_bank_addrs[bank][block]; +  	/*  	 * For SMCA enabled processors, BLKPTR field of the first MISC register  	 * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).  	 
*/ -	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) -		return addr; +	if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) +		goto out;  	if (!(low & MCI_CONFIG_MCAX)) -		return addr; +		goto out; -	if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && +	if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&  	    (low & MASK_BLKPTR_LO)) -		return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); +		addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); +out: +	smca_bank_addrs[bank][block] = addr;  	return addr;  } -static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, +static u32 get_block_address(u32 current_addr, u32 low, u32 high,  			     unsigned int bank, unsigned int block)  {  	u32 addr = 0, offset = 0; @@ -468,20 +478,8 @@ static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 hi  	if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))  		return addr; -	/* Get address from already initialized block. */ -	if (per_cpu(threshold_banks, cpu)) { -		struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank]; - -		if (bankp && bankp->blocks) { -			struct threshold_block *blockp = &bankp->blocks[block]; - -			if (blockp) -				return blockp->address; -		} -	} -  	if (mce_flags.smca) -		return smca_get_block_address(cpu, bank, block); +		return smca_get_block_address(bank, block);  	/* Fall back to method we used for older processors: */  	switch (block) { @@ -559,7 +557,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)  			smca_configure(bank, cpu);  		for (block = 0; block < NR_BLOCKS; ++block) { -			address = get_block_address(cpu, address, low, high, bank, block); +			address = get_block_address(address, low, high, bank, block);  			if (!address)  				break; @@ -1176,7 +1174,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,  	if (err)  		goto out_free;  recurse: -	address = get_block_address(cpu, address, low, high, bank, ++block); +	address = get_block_address(address, low, high, bank, ++block);  	if (!address)  		return 0; @@ -1386,7 +1384,7 @@ int mce_threshold_create_device(unsigned int cpu)  	if (bp)  		return 0; -	bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, +	bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),  		     GFP_KERNEL);  	if (!bp)  		return -ENOMEM; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 77e201301528..08286269fd24 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -70,7 +70,7 @@ static DEFINE_MUTEX(microcode_mutex);  /*   * Serialize late loading so that CPUs get updated one-by-one.   
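/*
 * Host-side sketch (not from this patch) of the smca_bank_addrs[][] cache
 * in the mce_amd.c hunk above: probe the hardware once per (bank, block),
 * remember the answer -- including "no address", stored as 0 -- and keep
 * -1 only as the "never looked up" sentinel, mirroring the range
 * initializer in the patch.  demo_probe_hw() and its MSR numbers are
 * entirely made up and stand in for the rdmsr_safe() path.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_BANKS      4
#define DEMO_BLOCKS     5

static uint32_t demo_addrs[DEMO_BANKS][DEMO_BLOCKS] = {
        [0 ... DEMO_BANKS - 1] = { [0 ... DEMO_BLOCKS - 1] = (uint32_t)-1 }
};

static int demo_probes;

/* Pretend hardware probe: only block 0 of each bank has an address. */
static uint32_t demo_probe_hw(unsigned int bank, unsigned int block)
{
        demo_probes++;
        return block == 0 ? 0xc0002003 + bank * 0x10 : 0;
}

static uint32_t demo_block_address(unsigned int bank, unsigned int block)
{
        if (demo_addrs[bank][block] != (uint32_t)-1)
                return demo_addrs[bank][block];

        demo_addrs[bank][block] = demo_probe_hw(bank, block);
        return demo_addrs[bank][block];
}

int main(void)
{
        demo_block_address(1, 0);       /* first lookup: probes the hardware */
        demo_block_address(1, 0);       /* served from the cache, no probe */
        demo_block_address(1, 3);       /* probes once, caches the 0 result */
        printf("hardware probed %d times for 3 lookups\n", demo_probes);
        return 0;
}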
*/ -static DEFINE_SPINLOCK(update_lock); +static DEFINE_RAW_SPINLOCK(update_lock);  struct ucode_cpu_info		ucode_cpu_info[NR_CPUS]; @@ -560,9 +560,9 @@ static int __reload_late(void *info)  	if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))  		return -1; -	spin_lock(&update_lock); +	raw_spin_lock(&update_lock);  	apply_microcode_local(&err); -	spin_unlock(&update_lock); +	raw_spin_unlock(&update_lock);  	/* siblings return UCODE_OK because their engine got updated already */  	if (err > UCODE_NFOUND) { diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 1c2cfa0644aa..97ccf4c3b45b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -190,8 +190,11 @@ static void save_microcode_patch(void *data, unsigned int size)  			p = memdup_patch(data, size);  			if (!p)  				pr_err("Error allocating buffer %p\n", data); -			else +			else {  				list_replace(&iter->plist, &p->plist); +				kfree(iter->data); +				kfree(iter); +			}  		}  	} diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index ad9e5ed81181..2ad9107ee980 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y		:= main.o if.o generic.o cleanup.o +obj-y		:= mtrr.o if.o generic.o cleanup.o  obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 558444b23923..4021d3859499 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -43,7 +43,7 @@ mtrr_file_add(unsigned long base, unsigned long size,  	max = num_var_ranges;  	if (fcount == NULL) { -		fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL); +		fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL);  		if (!fcount)  			return -ENOMEM;  		FILE_FCOUNT(file) = fcount; @@ -106,17 +106,9 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)  	memset(line, 0, LINE_SIZE); -	length = len; -	length--; - -	if (length > LINE_SIZE - 1) -		length = LINE_SIZE - 1; - +	length = strncpy_from_user(line, buf, LINE_SIZE - 1);  	if (length < 0) -		return -EINVAL; - -	if (copy_from_user(line, buf, length)) -		return -EFAULT; +		return length;  	linelen = strlen(line);  	ptr = line + linelen - 1; @@ -149,17 +141,16 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)  		return -EINVAL;  	ptr = skip_spaces(ptr + 5); -	for (i = 0; i < MTRR_NUM_TYPES; ++i) { -		if (strcmp(ptr, mtrr_strings[i])) -			continue; -		base >>= PAGE_SHIFT; -		size >>= PAGE_SHIFT; -		err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); -		if (err < 0) -			return err; -		return len; -	} -	return -EINVAL; +	i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr); +	if (i < 0) +		return i; + +	base >>= PAGE_SHIFT; +	size >>= PAGE_SHIFT; +	err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); +	if (err < 0) +		return err; +	return len;  }  static long diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 7468de429087..9a19c800fe40 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -46,6 +46,7 @@  #include <linux/pci.h>  #include <linux/smp.h>  #include <linux/syscore_ops.h> +#include <linux/rcupdate.h>  #include <asm/cpufeature.h>  #include <asm/e820/api.h> @@ -100,7 +101,7 @@ static int have_wrcomb(void)  		if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&  		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&  		    
dev->revision <= 5) { -			pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); +			pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");  			pci_dev_put(dev);  			return 0;  		} @@ -110,7 +111,7 @@ static int have_wrcomb(void)  		 */  		if (dev->vendor == PCI_VENDOR_ID_INTEL &&  		    dev->device == PCI_DEVICE_ID_INTEL_82451NX) { -			pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); +			pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");  			pci_dev_put(dev);  			return 0;  		} @@ -312,24 +313,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,  		return error;  	if (type >= MTRR_NUM_TYPES) { -		pr_warn("mtrr: type: %u invalid\n", type); +		pr_warn("type: %u invalid\n", type);  		return -EINVAL;  	}  	/* If the type is WC, check that this processor supports it */  	if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { -		pr_warn("mtrr: your processor doesn't support write-combining\n"); +		pr_warn("your processor doesn't support write-combining\n");  		return -ENOSYS;  	}  	if (!size) { -		pr_warn("mtrr: zero sized request\n"); +		pr_warn("zero sized request\n");  		return -EINVAL;  	}  	if ((base | (base + size - 1)) >>  	    (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { -		pr_warn("mtrr: base or size exceeds the MTRR width\n"); +		pr_warn("base or size exceeds the MTRR width\n");  		return -EINVAL;  	} @@ -360,8 +361,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,  				} else if (types_compatible(type, ltype))  					continue;  			} -			pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing" -				" 0x%lx000,0x%lx000\n", base, size, lbase, +			pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,  				lsize);  			goto out;  		} @@ -369,7 +369,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,  		if (ltype != type) {  			if (types_compatible(type, ltype))  				continue; -			pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", +			pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",  				base, size, mtrr_attrib_to_str(ltype),  				mtrr_attrib_to_str(type));  			goto out; @@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,  			}  		}  	} else { -		pr_info("mtrr: no more MTRRs available\n"); +		pr_info("no more MTRRs available\n");  	}  	error = i;   out: @@ -407,8 +407,8 @@ int mtrr_add_page(unsigned long base, unsigned long size,  static int mtrr_check(unsigned long base, unsigned long size)  {  	if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { -		pr_warn("mtrr: size and base must be multiples of 4 kiB\n"); -		pr_debug("mtrr: size: 0x%lx  base: 0x%lx\n", size, base); +		pr_warn("size and base must be multiples of 4 kiB\n"); +		pr_debug("size: 0x%lx  base: 0x%lx\n", size, base);  		dump_stack();  		return -1;  	} @@ -499,22 +499,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)  			}  		}  		if (reg < 0) { -			pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", +			pr_debug("no MTRR for %lx000,%lx000 found\n",  				 base, size);  			goto out;  		}  	}  	if (reg >= max) { -		pr_warn("mtrr: register: %d too big\n", reg); +		pr_warn("register: %d too big\n", reg);  		goto out;  	}  	mtrr_if->get(reg, &lbase, &lsize, <ype);  	if (lsize < 1) { -		pr_warn("mtrr: MTRR %d not used\n", reg); +		pr_warn("MTRR %d not used\n", reg);  		goto out;  	}  	if (mtrr_usage_table[reg] < 1) { -		pr_warn("mtrr: reg: %d has count=0\n", reg); +		pr_warn("reg: %d has count=0\n", reg);  		goto 
out;  	}  	if (--mtrr_usage_table[reg] < 1) @@ -775,7 +775,7 @@ void __init mtrr_bp_init(void)  	}  	if (!mtrr_enabled()) { -		pr_info("MTRR: Disabled\n"); +		pr_info("Disabled\n");  		/*  		 * PAT initialization relies on MTRR's rendezvous handler. @@ -793,6 +793,9 @@ void mtrr_ap_init(void)  	if (!use_intel() || mtrr_aps_delayed_init)  		return; + +	rcu_cpu_starting(smp_processor_id()); +  	/*  	 * Ideally we should hold mtrr_mutex here to avoid mtrr entries  	 * changed, but this routine will be called in cpu boot time, diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index b099024d339c..81c0afb39d0a 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -27,7 +27,7 @@   * exists, use it for populating initial_apicid and cpu topology   * detection.   */ -void detect_extended_topology(struct cpuinfo_x86 *c) +int detect_extended_topology(struct cpuinfo_x86 *c)  {  #ifdef CONFIG_SMP  	unsigned int eax, ebx, ecx, edx, sub_index; @@ -36,7 +36,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)  	static bool printed;  	if (c->cpuid_level < 0xb) -		return; +		return -1;  	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); @@ -44,7 +44,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)  	 * check if the cpuid leaf 0xb is actually implemented.  	 */  	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) -		return; +		return -1;  	set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); @@ -95,6 +95,6 @@ void detect_extended_topology(struct cpuinfo_x86 *c)  			       c->cpu_core_id);  		printed = 1;  	} -	return;  #endif +	return 0;  } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 18fa9d74c182..666a284116ac 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,11 +22,14 @@  #include <asm/stacktrace.h>  #include <asm/unwind.h> +#define OPCODE_BUFSIZE 64 +  int panic_on_unrecovered_nmi;  int panic_on_io_nmi; -static unsigned int code_bytes = 64;  static int die_counter; +static struct pt_regs exec_summary_regs; +  bool in_task_stack(unsigned long *stack, struct task_struct *task,  		   struct stack_info *info)  { @@ -69,9 +72,62 @@ static void printk_stack_address(unsigned long address, int reliable,  	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);  } +/* + * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus: + * + * In case where we don't have the exact kernel image (which, if we did, we can + * simply disassemble and navigate to the RIP), the purpose of the bigger + * prologue is to have more context and to be able to correlate the code from + * the different toolchains better. + * + * In addition, it helps in recreating the register allocation of the failing + * kernel and thus make sense of the register dump. + * + * What is more, the additional complication of a variable length insn arch like + * x86 warrants having longer byte sequence before rIP so that the disassembler + * can "sync" up properly and find instruction boundaries when decoding the + * opcode bytes. + * + * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random + * guesstimate in attempt to achieve all of the above. 
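/*
 * Host-side sketch (not from this patch) of the dump format show_opcodes()
 * below produces: a 64-byte window with roughly two thirds of it before
 * the failing instruction, whose first byte is wrapped in <>.  A plain
 * byte array stands in for kernel text so the example stays
 * self-contained; the real code also copes with an unreadable RIP.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define DEMO_OPCODE_BUFSIZE     64

static void demo_show_opcodes(const uint8_t *text, size_t fault_off)
{
        size_t prologue = DEMO_OPCODE_BUFSIZE * 2 / 3;  /* 42 bytes before RIP */
        size_t start = fault_off - prologue;
        size_t i;

        printf("Code: ");
        for (i = 0; i < DEMO_OPCODE_BUFSIZE; i++) {
                if (start + i == fault_off)
                        printf("<%02x> ", text[start + i]);
                else
                        printf("%02x ", text[start + i]);
        }
        printf("\n");
}

int main(void)
{
        static uint8_t fake_text[256];
        size_t i;

        for (i = 0; i < sizeof(fake_text); i++)
                fake_text[i] = (uint8_t)i;

        demo_show_opcodes(fake_text, 128);      /* pretend RIP is offset 128 */
        return 0;
}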
+ */ +void show_opcodes(u8 *rip, const char *loglvl) +{ +	unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3; +	u8 opcodes[OPCODE_BUFSIZE]; +	u8 *ip; +	int i; + +	printk("%sCode: ", loglvl); + +	ip = (u8 *)rip - code_prologue; +	if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) { +		pr_cont("Bad RIP value.\n"); +		return; +	} + +	for (i = 0; i < OPCODE_BUFSIZE; i++, ip++) { +		if (ip == rip) +			pr_cont("<%02x> ", opcodes[i]); +		else +			pr_cont("%02x ", opcodes[i]); +	} +	pr_cont("\n"); +} + +void show_ip(struct pt_regs *regs, const char *loglvl) +{ +#ifdef CONFIG_X86_32 +	printk("%sEIP: %pS\n", loglvl, (void *)regs->ip); +#else +	printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip); +#endif +	show_opcodes((u8 *)regs->ip, loglvl); +} +  void show_iret_regs(struct pt_regs *regs)  { -	printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); +	show_ip(regs, KERN_DEFAULT);  	printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,  		regs->sp, regs->flags);  } @@ -267,7 +323,6 @@ unsigned long oops_begin(void)  	bust_spinlocks(1);  	return flags;  } -EXPORT_SYMBOL_GPL(oops_begin);  NOKPROBE_SYMBOL(oops_begin);  void __noreturn rewind_stack_do_exit(int signr); @@ -287,6 +342,9 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)  	raw_local_irq_restore(flags);  	oops_exit(); +	/* Executive summary in case the oops scrolled away */ +	__show_regs(&exec_summary_regs, true); +  	if (!signr)  		return;  	if (in_interrupt()) @@ -305,10 +363,10 @@ NOKPROBE_SYMBOL(oops_end);  int __die(const char *str, struct pt_regs *regs, long err)  { -#ifdef CONFIG_X86_32 -	unsigned short ss; -	unsigned long sp; -#endif +	/* Save the regs of the first oops for the executive summary later. */ +	if (!die_counter) +		exec_summary_regs = *regs; +  	printk(KERN_DEFAULT  	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,  	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "", @@ -318,26 +376,13 @@ int __die(const char *str, struct pt_regs *regs, long err)  	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?  	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); +	show_regs(regs); +	print_modules(); +  	if (notify_die(DIE_OOPS, str, regs, err,  			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)  		return 1; -	print_modules(); -	show_regs(regs); -#ifdef CONFIG_X86_32 -	if (user_mode(regs)) { -		sp = regs->sp; -		ss = regs->ss; -	} else { -		sp = kernel_stack_pointer(regs); -		savesegment(ss, ss); -	} -	printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n", -	       (void *)regs->ip, ss, sp); -#else -	/* Executive summary in case the oops scrolled away */ -	printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp); -#endif  	return 0;  }  NOKPROBE_SYMBOL(__die); @@ -356,30 +401,9 @@ void die(const char *str, struct pt_regs *regs, long err)  	oops_end(flags, regs, sig);  } -static int __init code_bytes_setup(char *s) -{ -	ssize_t ret; -	unsigned long val; - -	if (!s) -		return -EINVAL; - -	ret = kstrtoul(s, 0, &val); -	if (ret) -		return ret; - -	code_bytes = val; -	if (code_bytes > 8192) -		code_bytes = 8192; - -	return 1; -} -__setup("code_bytes=", code_bytes_setup); -  void show_regs(struct pt_regs *regs)  {  	bool all = true; -	int i;  	show_regs_print_info(KERN_DEFAULT); @@ -389,36 +413,8 @@ void show_regs(struct pt_regs *regs)  	__show_regs(regs, all);  	/* -	 * When in-kernel, we also print out the stack and code at the -	 * time of the fault.. 
+	 * When in-kernel, we also print out the stack at the time of the fault..  	 */ -	if (!user_mode(regs)) { -		unsigned int code_prologue = code_bytes * 43 / 64; -		unsigned int code_len = code_bytes; -		unsigned char c; -		u8 *ip; - +	if (!user_mode(regs))  		show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); - -		printk(KERN_DEFAULT "Code: "); - -		ip = (u8 *)regs->ip - code_prologue; -		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { -			/* try starting at IP */ -			ip = (u8 *)regs->ip; -			code_len = code_len - code_prologue + 1; -		} -		for (i = 0; i < code_len; i++, ip++) { -			if (ip < (u8 *)PAGE_OFFSET || -					probe_kernel_address(ip, c)) { -				pr_cont(" Bad RIP value."); -				break; -			} -			if (ip == (u8 *)regs->ip) -				pr_cont("<%02x> ", c); -			else -				pr_cont("%02x ", c); -		} -	} -	pr_cont("\n");  } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6a2cb1442e05..c88c23c658c1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -155,7 +155,8 @@ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 si  	int x = table->nr_entries;  	if (x >= ARRAY_SIZE(table->entries)) { -		pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1); +		pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n", +		       start, start + size - 1);  		return;  	} @@ -190,9 +191,10 @@ void __init e820__print_table(char *who)  	int i;  	for (i = 0; i < e820_table->nr_entries; i++) { -		pr_info("%s: [mem %#018Lx-%#018Lx] ", who, -		       e820_table->entries[i].addr, -		       e820_table->entries[i].addr + e820_table->entries[i].size - 1); +		pr_info("%s: [mem %#018Lx-%#018Lx] ", +			who, +			e820_table->entries[i].addr, +			e820_table->entries[i].addr + e820_table->entries[i].size - 1);  		e820_print_type(e820_table->entries[i].type);  		pr_cont("\n"); @@ -574,7 +576,7 @@ void __init e820__update_table_print(void)  	if (e820__update_table(e820_table))  		return; -	pr_info("e820: modified physical RAM map:\n"); +	pr_info("modified physical RAM map:\n");  	e820__print_table("modified");  } @@ -636,9 +638,8 @@ __init void e820__setup_pci_gap(void)  	if (!found) {  #ifdef CONFIG_X86_64  		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; -		pr_err( -			"e820: Cannot find an available gap in the 32-bit address range\n" -			"e820: PCI devices with unassigned 32-bit BARs may not work!\n"); +		pr_err("Cannot find an available gap in the 32-bit address range\n"); +		pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");  #else  		gapstart = 0x10000000;  #endif @@ -649,7 +650,8 @@ __init void e820__setup_pci_gap(void)  	 */  	pci_mem_start = gapstart; -	pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1); +	pr_info("[mem %#010lx-%#010lx] available for PCI devices\n", +		gapstart, gapstart + gapsize - 1);  }  /* @@ -711,7 +713,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)  	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));  	early_memunmap(sdata, data_len); -	pr_info("e820: extended physical RAM map:\n"); +	pr_info("extended physical RAM map:\n");  	e820__print_table("extended");  } @@ -780,7 +782,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)  	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);  	if (addr) {  		e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); -		pr_info("e820: update e820_table_kexec for 
e820__memblock_alloc_reserved()\n"); +		pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");  		e820__update_table_kexec();  	} @@ -830,8 +832,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type  	if (last_pfn > max_arch_pfn)  		last_pfn = max_arch_pfn; -	pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n", -			 last_pfn, max_arch_pfn); +	pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n", +		last_pfn, max_arch_pfn);  	return last_pfn;  } @@ -1005,7 +1007,7 @@ void __init e820__finish_early_params(void)  		if (e820__update_table(e820_table) < 0)  			early_panic("Invalid user supplied memory map"); -		pr_info("e820: user-defined physical RAM map:\n"); +		pr_info("user-defined physical RAM map:\n");  		e820__print_table("user");  	}  } @@ -1238,7 +1240,7 @@ void __init e820__memory_setup(void)  	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));  	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); -	pr_info("e820: BIOS-provided physical RAM map:\n"); +	pr_info("BIOS-provided physical RAM map:\n");  	e820__print_table(who);  } @@ -1246,6 +1248,7 @@ void __init e820__memblock_setup(void)  {  	int i;  	u64 end; +	u64 addr = 0;  	/*  	 * The bootstrap memblock region count maximum is 128 entries @@ -1262,13 +1265,21 @@ void __init e820__memblock_setup(void)  		struct e820_entry *entry = &e820_table->entries[i];  		end = entry->addr + entry->size; +		if (addr < entry->addr) +			memblock_reserve(addr, entry->addr - addr); +		addr = end;  		if (end != (resource_size_t)end)  			continue; +		/* +		 * all !E820_TYPE_RAM ranges (including gap ranges) are put +		 * into memblock.reserved to make sure that struct pages in +		 * such regions are not left uninitialized after bootup. 
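/*
 * Host-side sketch (not from this patch) of the walk this comment and the
 * reserve-the-hole check a few lines up describe: remember where the
 * previous entry ended, reserve any gap in front of the next one, and
 * hand RAM to the allocator while reserving everything else.  The table
 * below is a made-up miniature memory map.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct demo_e820_entry {
        uint64_t addr, size;
        int is_ram;
};

int main(void)
{
        static const struct demo_e820_entry table[] = {
                { 0x00000000, 0x0009f000, 1 },  /* low RAM */
                { 0x000f0000, 0x00010000, 0 },  /* reserved, hole before it */
                { 0x00100000, 0x3ff00000, 1 },  /* main RAM */
        };
        uint64_t addr = 0;
        size_t i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                const struct demo_e820_entry *e = &table[i];

                if (addr < e->addr)             /* gap between entries */
                        printf("reserve hole  [%#010llx-%#010llx]\n",
                               (unsigned long long)addr,
                               (unsigned long long)(e->addr - 1));
                addr = e->addr + e->size;

                printf("%s [%#010llx-%#010llx]\n",
                       e->is_ram ? "add RAM      " : "reserve entry",
                       (unsigned long long)e->addr,
                       (unsigned long long)(e->addr + e->size - 1));
        }
        return 0;
}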
+		 */  		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) -			continue; - -		memblock_add(entry->addr, entry->size); +			memblock_reserve(entry->addr, entry->size); +		else +			memblock_add(entry->addr, entry->size);  	}  	/* Throw away partial pages: */ diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 72c2cf961d44..50d5848bf22e 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -28,8 +28,6 @@  #include <asm/irq_remapping.h>  #include <asm/early_ioremap.h> -#define dev_err(msg)  pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) -  static void __init fix_hypertransport_config(int num, int slot, int func)  {  	u32 htcfg; @@ -635,7 +633,8 @@ static void __init apple_airport_reset(int bus, int slot, int func)  		pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);  		if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { -			dev_err("Cannot power up Apple AirPort card\n"); +			pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n", +			       bus, slot, func);  			return;  		}  	} @@ -646,7 +645,8 @@ static void __init apple_airport_reset(int bus, int slot, int func)  	mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);  	if (!mmio) { -		dev_err("Cannot iomap Apple AirPort card\n"); +		pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n", +		       bus, slot, func);  		return;  	} diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 0c408f8c4ed4..8047379e575a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -6,6 +6,10 @@   */  #define DISABLE_BRANCH_PROFILING + +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 +  #include <linux/init.h>  #include <linux/linkage.h>  #include <linux/types.h> @@ -32,11 +36,6 @@  #include <asm/microcode.h>  #include <asm/kasan.h> -#ifdef CONFIG_X86_5LEVEL -#undef pgtable_l5_enabled -#define pgtable_l5_enabled __pgtable_l5_enabled -#endif -  /*   * Manage page tables very early on.   */ @@ -46,7 +45,6 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);  #ifdef CONFIG_X86_5LEVEL  unsigned int __pgtable_l5_enabled __ro_after_init; -EXPORT_SYMBOL(__pgtable_l5_enabled);  unsigned int pgdir_shift __ro_after_init = 39;  EXPORT_SYMBOL(pgdir_shift);  unsigned int ptrs_per_p4d __ro_after_init = 1; @@ -82,13 +80,14 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)  static bool __head check_la57_support(unsigned long physaddr)  { -	if (native_cpuid_eax(0) < 7) -		return false; - -	if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) +	/* +	 * 5-level paging is detected and enabled at kernel decomression +	 * stage. Only check if it has been enabled there. +	 */ +	if (!(native_read_cr4() & X86_CR4_LA57))  		return false; -	*fixup_int(&pgtable_l5_enabled, physaddr) = 1; +	*fixup_int(&__pgtable_l5_enabled, physaddr) = 1;  	*fixup_int(&pgdir_shift, physaddr) = 48;  	*fixup_int(&ptrs_per_p4d, physaddr) = 512;  	*fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; @@ -104,6 +103,12 @@ static bool __head check_la57_support(unsigned long physaddr)  }  #endif +/* Code in __startup_64() can be relocated during execution, but the compiler + * doesn't have to generate PC-relative relocations when accessing globals from + * that function. Clang actually does not generate them, which leads to + * boot-time crashes. 
To work around this problem, every global pointer must + * be adjusted using fixup_pointer(). + */  unsigned long __head __startup_64(unsigned long physaddr,  				  struct boot_params *bp)  { @@ -113,6 +118,7 @@ unsigned long __head __startup_64(unsigned long physaddr,  	p4dval_t *p4d;  	pudval_t *pud;  	pmdval_t *pmd, pmd_entry; +	pteval_t *mask_ptr;  	bool la57;  	int i;  	unsigned int *next_pgt_ptr; @@ -196,7 +202,8 @@ unsigned long __head __startup_64(unsigned long physaddr,  	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;  	/* Filter out unsupported __PAGE_KERNEL_* bits: */ -	pmd_entry &= __supported_pte_mask; +	mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr); +	pmd_entry &= *mask_ptr;  	pmd_entry += sme_get_me_mask();  	pmd_entry +=  physaddr; @@ -273,7 +280,7 @@ again:  	 * critical -- __PAGE_OFFSET would point us back into the dynamic  	 * range and we might end up looping forever...  	 */ -	if (!pgtable_l5_enabled) +	if (!pgtable_l5_enabled())  		p4d_p = pgd_p;  	else if (pgd)  		p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b59e4fb40fd9..abe6df15a8fb 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -375,7 +375,7 @@ ENDPROC(startup_32_smp)   */  __INIT  setup_once: -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR  	/*  	 * Configure the stack canary. The linker can't handle this by  	 * relocation.  Manually set base address in stack canary diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 8ce4212e2b8d..346b24883911 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -610,7 +610,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)  	if (!hpet_domain)  		return; -	hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); +	hpet_devs = kcalloc(num_timers, sizeof(struct hpet_dev), GFP_KERNEL);  	if (!hpet_devs)  		return; @@ -966,8 +966,8 @@ int __init hpet_enable(void)  #endif  	cfg = hpet_readl(HPET_CFG); -	hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg), -				GFP_KERNEL); +	hpet_boot_cfg = kmalloc_array(last + 2, sizeof(*hpet_boot_cfg), +				      GFP_KERNEL);  	if (hpet_boot_cfg)  		*hpet_boot_cfg = cfg;  	else @@ -975,8 +975,7 @@ int __init hpet_enable(void)  	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);  	hpet_writel(cfg, HPET_CFG);  	if (cfg) -		pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", -			cfg); +		pr_warn("Unrecognized bits %#x set in global cfg\n", cfg);  	for (i = 0; i <= last; ++i) {  		cfg = hpet_readl(HPET_Tn_CFG(i)); @@ -988,7 +987,7 @@ int __init hpet_enable(void)  			 | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE  			 | HPET_TN_FSB | HPET_TN_FSB_CAP);  		if (cfg) -			pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n", +			pr_warn("Unrecognized bits %#x set in cfg#%u\n",  				cfg, i);  	}  	hpet_print_config(); diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index 8eeaa81de066..0a3e70fd00d6 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -9,10 +9,12 @@   * your option) any later version.   
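/*
 * Host-side sketch (not from this patch) of the fixup_pointer() idiom the
 * __startup_64() comment above describes: before relocations are applied,
 * a global's link-time address has to be rebased onto wherever the image
 * actually sits, i.e. shifted by the difference between the two bases.
 * Two buffers stand in for the linked and loaded images; none of this is
 * the kernel's actual implementation.
 */
#include <stdio.h>
#include <string.h>

static char linked_image[64];
static char loaded_image[64];

static void *demo_fixup_pointer(void *ptr, void *link_base, void *load_base)
{
        return (char *)ptr - (char *)link_base + (char *)load_base;
}

int main(void)
{
        char *global = linked_image + 8;        /* some global inside the image */
        char *fixed;

        strcpy(linked_image + 8, "stale link-time data");
        memcpy(loaded_image, linked_image, sizeof(linked_image));
        strcpy(loaded_image + 8, "data at the load address");

        fixed = demo_fixup_pointer(global, linked_image, loaded_image);
        printf("unfixed: \"%s\"\nfixed:   \"%s\"\n", global, fixed);
        return 0;
}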
*/ +#include <linux/dmi.h>  #include <linux/init.h>  #include <linux/syscore_ops.h>  #include <asm/dma.h> +#include <asm/x86_init.h>  /*   * This module just handles suspend/resume issues with the @@ -49,6 +51,29 @@ static struct syscore_ops i8237_syscore_ops = {  static int __init i8237A_init_ops(void)  { +	/* +	 * From SKL PCH onwards, the legacy DMA device is removed in which the +	 * I/O ports (81h-83h, 87h, 89h-8Bh, 8Fh) related to it are removed +	 * as well. All removed ports must return 0xff for a inb() request. +	 * +	 * Note: DMA_PAGE_2 (port 0x81) should not be checked for detecting +	 * the presence of DMA device since it may be used by BIOS to decode +	 * LPC traffic for POST codes. Original LPC only decodes one byte of +	 * port 0x80 but some BIOS may choose to enhance PCH LPC port 0x8x +	 * decoding. +	 */ +	if (dma_inb(DMA_PAGE_0) == 0xFF) +		return -ENODEV; + +	/* +	 * It is not required to load this driver as newer SoC may not +	 * support 8237 DMA or bus mastering from LPC. Platform firmware +	 * must announce the support for such legacy devices via +	 * ACPI_FADT_LEGACY_DEVICES field in FADT table. +	 */ +	if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017) +		return -ENODEV; +  	register_syscore_ops(&i8237_syscore_ops);  	return 0;  } diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 2c3a1b4294eb..74383a3780dc 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -317,15 +317,12 @@ void __init idt_setup_apic_and_irq_gates(void)  		set_intr_gate(i, entry);  	} -	for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {  #ifdef CONFIG_X86_LOCAL_APIC +	for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {  		set_bit(i, system_vectors);  		set_intr_gate(i, spurious_interrupt); -#else -		entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); -		set_intr_gate(i, entry); -#endif  	} +#endif  }  /** diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index a15fe0e92cf9..108c48d0d40e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -37,7 +37,7 @@ static uint32_t __init jailhouse_detect(void)  	return jailhouse_cpuid_base();  } -static void jailhouse_get_wallclock(struct timespec *now) +static void jailhouse_get_wallclock(struct timespec64 *now)  {  	memset(now, 0, sizeof(*now));  } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0715f827607c..6f4d42377fe5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -370,6 +370,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)  	if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)  		return 0; +	/* We should not singlestep on the exception masking instructions */ +	if (insn_masking_exception(insn)) +		return 0; +  #ifdef CONFIG_X86_64  	/* Only x86_64 has RIP relative instructions */  	if (insn_rip_relative(insn)) { diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 8c1cc08f514f..163ae706a0d4 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -283,7 +283,7 @@ static int __init create_setup_data_nodes(struct kobject *parent)  	if (ret)  		goto out_setup_data_kobj; -	kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL); +	kobjp = kmalloc_array(nr, sizeof(*kobjp), GFP_KERNEL);  	if (!kobjp) {  		ret = -ENOMEM;  		goto out_setup_data_kobj; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7867417cfaff..5b2300b818af 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -457,7 +457,7 @@ static void 
__init sev_map_percpu_data(void)  static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)  {  	native_smp_prepare_cpus(max_cpus); -	if (kvm_para_has_hint(KVM_HINTS_DEDICATED)) +	if (kvm_para_has_hint(KVM_HINTS_REALTIME))  		static_branch_disable(&virt_spin_lock_key);  } @@ -553,7 +553,7 @@ static void __init kvm_guest_init(void)  	}  	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && -	    !kvm_para_has_hint(KVM_HINTS_DEDICATED) && +	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&  	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))  		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; @@ -649,7 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void)  	int cpu;  	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && -	    !kvm_para_has_hint(KVM_HINTS_DEDICATED) && +	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&  	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {  		for_each_possible_cpu(cpu) {  			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), @@ -745,7 +745,7 @@ void __init kvm_spinlock_init(void)  	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))  		return; -	if (kvm_para_has_hint(KVM_HINTS_DEDICATED)) +	if (kvm_para_has_hint(KVM_HINTS_REALTIME))  		return;  	__pv_init_lock_hash(); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 8b26c9e01cc4..bf8d1eb7fca3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -53,7 +53,7 @@ static struct pvclock_wall_clock *wall_clock;   * have elapsed since the hypervisor wrote the data. So we try to account for   * that with system time   */ -static void kvm_get_wallclock(struct timespec *now) +static void kvm_get_wallclock(struct timespec64 *now)  {  	struct pvclock_vcpu_time_info *vcpu_time;  	int low, high; @@ -72,7 +72,7 @@ static void kvm_get_wallclock(struct timespec *now)  	put_cpu();  } -static int kvm_set_wallclock(const struct timespec *now) +static int kvm_set_wallclock(const struct timespec64 *now)  {  	return -ENODEV;  } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 60cdec6628b0..d1ab07ec8c9a 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -57,12 +57,17 @@ static void load_segments(void)  static void machine_kexec_free_page_tables(struct kimage *image)  {  	free_page((unsigned long)image->arch.pgd); +	image->arch.pgd = NULL;  #ifdef CONFIG_X86_PAE  	free_page((unsigned long)image->arch.pmd0); +	image->arch.pmd0 = NULL;  	free_page((unsigned long)image->arch.pmd1); +	image->arch.pmd1 = NULL;  #endif  	free_page((unsigned long)image->arch.pte0); +	image->arch.pte0 = NULL;  	free_page((unsigned long)image->arch.pte1); +	image->arch.pte1 = NULL;  }  static int machine_kexec_alloc_page_tables(struct kimage *image) @@ -79,7 +84,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image)  	    !image->arch.pmd0 || !image->arch.pmd1 ||  #endif  	    !image->arch.pte0 || !image->arch.pte1) { -		machine_kexec_free_page_tables(image);  		return -ENOMEM;  	}  	return 0; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a5e55d832d0a..4c8acdfdc5a7 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -39,9 +39,13 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {  static void free_transition_pgtable(struct kimage *image)  {  	free_page((unsigned long)image->arch.p4d); +	image->arch.p4d = NULL;  	free_page((unsigned long)image->arch.pud); +	image->arch.pud = NULL;  	free_page((unsigned 
long)image->arch.pmd); +	image->arch.pmd = NULL;  	free_page((unsigned long)image->arch.pte); +	image->arch.pte = NULL;  }  static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) @@ -91,7 +95,6 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)  	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));  	return 0;  err: -	free_transition_pgtable(image);  	return result;  } @@ -351,7 +354,8 @@ void arch_crash_save_vmcoreinfo(void)  {  	VMCOREINFO_NUMBER(phys_base);  	VMCOREINFO_SYMBOL(init_top_pgt); -	VMCOREINFO_NUMBER(pgtable_l5_enabled); +	vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", +			pgtable_l5_enabled());  #ifdef CONFIG_NUMA  	VMCOREINFO_SYMBOL(node_data); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 77625b60a510..ab5d9dd668d2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -15,13 +15,11 @@  #include <asm/x86_init.h>  #include <asm/iommu_table.h> -static int forbid_dac __read_mostly; +static bool disable_dac_quirk __read_mostly;  const struct dma_map_ops *dma_ops = &dma_direct_ops;  EXPORT_SYMBOL(dma_ops); -static int iommu_sac_force __read_mostly; -  #ifdef CONFIG_IOMMU_DEBUG  int panic_on_overflow __read_mostly = 1;  int force_iommu __read_mostly = 1; @@ -55,9 +53,6 @@ struct device x86_dma_fallback_dev = {  };  EXPORT_SYMBOL(x86_dma_fallback_dev); -/* Number of entries preallocated for DMA-API debugging */ -#define PREALLOC_DMA_DEBUG_ENTRIES       65536 -  void __init pci_iommu_alloc(void)  {  	struct iommu_table_entry *p; @@ -76,7 +71,7 @@ void __init pci_iommu_alloc(void)  	}  } -bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) +bool arch_dma_alloc_attrs(struct device **dev)  {  	if (!*dev)  		*dev = &x86_dma_fallback_dev; @@ -125,13 +120,13 @@ static __init int iommu_setup(char *p)  		if (!strncmp(p, "nomerge", 7))  			iommu_merge = 0;  		if (!strncmp(p, "forcesac", 8)) -			iommu_sac_force = 1; +			pr_warn("forcesac option ignored.\n");  		if (!strncmp(p, "allowdac", 8)) -			forbid_dac = 0; +			pr_warn("allowdac option ignored.\n");  		if (!strncmp(p, "nodac", 5)) -			forbid_dac = 1; +			pr_warn("nodac option ignored.\n");  		if (!strncmp(p, "usedac", 6)) { -			forbid_dac = -1; +			disable_dac_quirk = true;  			return 1;  		}  #ifdef CONFIG_SWIOTLB @@ -156,40 +151,9 @@ static __init int iommu_setup(char *p)  }  early_param("iommu", iommu_setup); -int arch_dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PCI -	if (mask > 0xffffffff && forbid_dac > 0) { -		dev_info(dev, "PCI: Disallowing DAC for device\n"); -		return 0; -	} -#endif - -	/* Tell the device to use SAC when IOMMU force is on.  This -	   allows the driver to use cheaper accesses in some cases. - -	   Problem with this is that if we overflow the IOMMU area and -	   return DAC as fallback address the device may not handle it -	   correctly. - -	   As a special case some controllers have a 39bit address -	   mode that is as efficient as 32bit (aic79xx). Don't force -	   SAC for these.  Assume all masks <= 40 bits are of this -	   type. Normally this doesn't make any difference, but gives -	   more gentle handling of IOMMU overflow. 
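/*
 * Host-side sketch (not from this patch) of the teardown pattern used in
 * the machine_kexec hunks above: every free also clears the pointer, so
 * the allocation error paths no longer free anything themselves and the
 * one real teardown can run later, even on a half-initialized image,
 * without double-freeing.  struct demo_image is a made-up stand-in.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_image {
        void *pgd, *pte;
};

static void demo_free_tables(struct demo_image *img)
{
        free(img->pgd);
        img->pgd = NULL;
        free(img->pte);
        img->pte = NULL;
}

static int demo_alloc_tables(struct demo_image *img)
{
        img->pgd = calloc(1, 4096);
        img->pte = calloc(1, 4096);
        if (!img->pgd || !img->pte)
                return -1;      /* no cleanup here; the caller tears down */
        return 0;
}

int main(void)
{
        struct demo_image img = { 0 };

        if (demo_alloc_tables(&img))
                puts("allocation failed");
        demo_free_tables(&img);
        demo_free_tables(&img); /* harmless: every pointer is already NULL */
        return 0;
}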
*/ -	if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { -		dev_info(dev, "Force SAC with mask %Lx\n", mask); -		return 0; -	} - -	return 1; -} -EXPORT_SYMBOL(arch_dma_supported); -  static int __init pci_iommu_init(void)  {  	struct iommu_table_entry *p; -	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);  #ifdef CONFIG_PCI  	dma_debug_add_bus(&pci_bus_type); @@ -209,11 +173,17 @@ rootfs_initcall(pci_iommu_init);  #ifdef CONFIG_PCI  /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ +static int via_no_dac_cb(struct pci_dev *pdev, void *data) +{ +	pdev->dev.dma_32bit_limit = true; +	return 0; +} +  static void via_no_dac(struct pci_dev *dev)  { -	if (forbid_dac == 0) { +	if (!disable_dac_quirk) {  		dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); -		forbid_dac = 1; +		pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);  	}  }  DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index e47b2dbbdef3..c06c4c16c6b6 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -151,17 +151,19 @@ void perf_get_regs_user(struct perf_regs *regs_user,  	regs_user_copy->sp = user_regs->sp;  	regs_user_copy->cs = user_regs->cs;  	regs_user_copy->ss = user_regs->ss; -  	/* -	 * Most system calls don't save these registers, don't report them. +	 * Store user space frame-pointer value on sample +	 * to facilitate stack unwinding for cases when +	 * user space executable code has such support +	 * enabled at compile time:  	 */ +	regs_user_copy->bp = user_regs->bp; +  	regs_user_copy->bx = -1; -	regs_user_copy->bp = -1;  	regs_user_copy->r12 = -1;  	regs_user_copy->r13 = -1;  	regs_user_copy->r14 = -1;  	regs_user_copy->r15 = -1; -  	/*  	 * For this to be at all useful, we need a reasonable guess for  	 * the ABI.  Be careful: we're in NMI context, and we're diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 235fe6008ac8..b348a672f71d 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -33,9 +33,14 @@ void __init x86_early_init_platform_quirks(void)  		x86_platform.set_legacy_features();  } +bool __init x86_pnpbios_disabled(void) +{ +	return x86_platform.legacy.devices.pnpbios == 0; +} +  #if defined(CONFIG_PNPBIOS)  bool __init arch_pnpbios_disabled(void)  { -	return x86_platform.legacy.devices.pnpbios == 0; +	return x86_pnpbios_disabled();  }  #endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 03408b942adb..30ca2d1a9231 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -38,6 +38,7 @@  #include <asm/switch_to.h>  #include <asm/desc.h>  #include <asm/prctl.h> +#include <asm/spec-ctrl.h>  /*   * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -278,6 +279,148 @@ static inline void switch_to_bitmap(struct tss_struct *tss,  	}  } +#ifdef CONFIG_SMP + +struct ssb_state { +	struct ssb_state	*shared_state; +	raw_spinlock_t		lock; +	unsigned int		disable_state; +	unsigned long		local_state; +}; + +#define LSTATE_SSB	0 + +static DEFINE_PER_CPU(struct ssb_state, ssb_state); + +void speculative_store_bypass_ht_init(void) +{ +	struct ssb_state *st = this_cpu_ptr(&ssb_state); +	unsigned int this_cpu = smp_processor_id(); +	unsigned int cpu; + +	st->local_state = 0; + +	/* +	 * Shared state setup happens once on the first bringup +	 * of the CPU. It's not destroyed on CPU hotunplug. 
+	 */ +	if (st->shared_state) +		return; + +	raw_spin_lock_init(&st->lock); + +	/* +	 * Go over HT siblings and check whether one of them has set up the +	 * shared state pointer already. +	 */ +	for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { +		if (cpu == this_cpu) +			continue; + +		if (!per_cpu(ssb_state, cpu).shared_state) +			continue; + +		/* Link it to the state of the sibling: */ +		st->shared_state = per_cpu(ssb_state, cpu).shared_state; +		return; +	} + +	/* +	 * First HT sibling to come up on the core.  Link shared state of +	 * the first HT sibling to itself. The siblings on the same core +	 * which come up later will see the shared state pointer and link +	 * themself to the state of this CPU. +	 */ +	st->shared_state = st; +} + +/* + * Logic is: First HT sibling enables SSBD for both siblings in the core + * and last sibling to disable it, disables it for the whole core. This how + * MSR_SPEC_CTRL works in "hardware": + * + *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL + */ +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ +	struct ssb_state *st = this_cpu_ptr(&ssb_state); +	u64 msr = x86_amd_ls_cfg_base; + +	if (!static_cpu_has(X86_FEATURE_ZEN)) { +		msr |= ssbd_tif_to_amd_ls_cfg(tifn); +		wrmsrl(MSR_AMD64_LS_CFG, msr); +		return; +	} + +	if (tifn & _TIF_SSBD) { +		/* +		 * Since this can race with prctl(), block reentry on the +		 * same CPU. +		 */ +		if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) +			return; + +		msr |= x86_amd_ls_cfg_ssbd_mask; + +		raw_spin_lock(&st->shared_state->lock); +		/* First sibling enables SSBD: */ +		if (!st->shared_state->disable_state) +			wrmsrl(MSR_AMD64_LS_CFG, msr); +		st->shared_state->disable_state++; +		raw_spin_unlock(&st->shared_state->lock); +	} else { +		if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) +			return; + +		raw_spin_lock(&st->shared_state->lock); +		st->shared_state->disable_state--; +		if (!st->shared_state->disable_state) +			wrmsrl(MSR_AMD64_LS_CFG, msr); +		raw_spin_unlock(&st->shared_state->lock); +	} +} +#else +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ +	u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); + +	wrmsrl(MSR_AMD64_LS_CFG, msr); +} +#endif + +static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) +{ +	/* +	 * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, +	 * so ssbd_tif_to_spec_ctrl() just works. 
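/*
 * Host-side sketch (not from this patch) of the counting done in
 * amd_set_core_ssb_state() above, with the per-CPU reentry guard and the
 * shared-state spinlock omitted: the core-wide mitigation follows the OR
 * of the siblings' requests, so only the 0 -> 1 and 1 -> 0 transitions of
 * the shared count touch the "MSR" (a plain flag here).
 */
#include <stdio.h>

static int core_disable_count;  /* plays the role of shared disable_state */
static int core_ssbd_active;    /* stands in for the bit in MSR_AMD64_LS_CFG */

static void demo_sibling_set_ssbd(int want)
{
        if (want) {
                if (core_disable_count++ == 0) {
                        core_ssbd_active = 1;
                        puts("wrmsr: SSBD on for the whole core");
                }
        } else {
                if (--core_disable_count == 0) {
                        core_ssbd_active = 0;
                        puts("wrmsr: SSBD off for the whole core");
                }
        }
}

int main(void)
{
        demo_sibling_set_ssbd(1);       /* thread 0 schedules an SSBD task */
        demo_sibling_set_ssbd(1);       /* thread 1 too: no MSR write */
        demo_sibling_set_ssbd(0);       /* thread 1 done: core stays protected */
        demo_sibling_set_ssbd(0);       /* last one out switches it off */
        return 0;
}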
+	 */ +	wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); +} + +static __always_inline void intel_set_ssb_state(unsigned long tifn) +{ +	u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); + +	wrmsrl(MSR_IA32_SPEC_CTRL, msr); +} + +static __always_inline void __speculative_store_bypass_update(unsigned long tifn) +{ +	if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) +		amd_set_ssb_virt_state(tifn); +	else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) +		amd_set_core_ssb_state(tifn); +	else +		intel_set_ssb_state(tifn); +} + +void speculative_store_bypass_update(unsigned long tif) +{ +	preempt_disable(); +	__speculative_store_bypass_update(tif); +	preempt_enable(); +} +  void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,  		      struct tss_struct *tss)  { @@ -309,6 +452,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,  	if ((tifp ^ tifn) & _TIF_NOCPUID)  		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); + +	if ((tifp ^ tifn) & _TIF_SSBD) +		__speculative_store_bypass_update(tifn);  }  /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5224c6099184..0ae659de21eb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -76,16 +76,14 @@ void __show_regs(struct pt_regs *regs, int all)  		savesegment(gs, gs);  	} -	printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); -	printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, -		raw_smp_processor_id()); +	show_ip(regs, KERN_DEFAULT);  	printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",  		regs->ax, regs->bx, regs->cx, regs->dx);  	printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",  		regs->si, regs->di, regs->bp, sp); -	printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", -	       (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); +	printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", +	       (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags);  	if (!all)  		return; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4b100fe0f508..12bb445fb98d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -542,6 +542,7 @@ void set_personality_64bit(void)  	clear_thread_flag(TIF_X32);  	/* Pretend that this comes from a 64bit execve */  	task_pt_regs(current)->orig_ax = __NR_execve; +	current_thread_info()->status &= ~TS_COMPAT;  	/* Ensure the corresponding mm is not marked. */  	if (current->mm) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index ed5c4cdf0a34..e2ee403865eb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1377,7 +1377,6 @@ static void fill_sigtrap_info(struct task_struct *tsk,  	tsk->thread.trap_nr = X86_TRAP_DB;  	tsk->thread.error_code = error_code; -	memset(info, 0, sizeof(*info));  	info->si_signo = SIGTRAP;  	info->si_code = si_code;  	info->si_addr = user_mode(regs) ? 
(void __user *)regs->ip : NULL; @@ -1395,6 +1394,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,  {  	struct siginfo info; +	clear_siginfo(&info);  	fill_sigtrap_info(tsk, regs, error_code, si_code, &info);  	/* Send us the fake SIGTRAP */  	force_sig_info(SIGTRAP, &info, tsk); diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 761f6af6efa5..637982efecd8 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -123,28 +123,35 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)  void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,  			    struct pvclock_vcpu_time_info *vcpu_time, -			    struct timespec *ts) +			    struct timespec64 *ts)  {  	u32 version;  	u64 delta; -	struct timespec now; +	struct timespec64 now;  	/* get wallclock at system boot */  	do {  		version = wall_clock->version;  		rmb();		/* fetch version before time */ +		/* +		 * Note: wall_clock->sec is a u32 value, so it can +		 * only store dates between 1970 and 2106. To allow +		 * times beyond that, we need to create a new hypercall +		 * interface with an extended pvclock_wall_clock structure +		 * like ARM has. +		 */  		now.tv_sec  = wall_clock->sec;  		now.tv_nsec = wall_clock->nsec;  		rmb();		/* fetch time before checking version */  	} while ((wall_clock->version & 1) || (version != wall_clock->version));  	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */ -	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; +	delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;  	now.tv_nsec = do_div(delta, NSEC_PER_SEC);  	now.tv_sec = delta; -	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +	set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);  }  void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 697a4ce04308..736348ead421 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)  /* Skylake */  static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)  { -	u32 capid0; +	u32 capid0, capid5;  	pci_read_config_dword(pdev, 0x84, &capid0); +	pci_read_config_dword(pdev, 0x98, &capid5); -	if ((capid0 & 0xc0) == 0xc0) +	/* +	 * CAPID0{7:6} indicate whether this is an advanced RAS SKU +	 * CAPID5{8:5} indicate that various NVDIMM usage modes are +	 * enabled, so memory machine check recovery is also enabled. +	 */ +	if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))  		static_branch_inc(&mcsafe_key); +  }  DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);  DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index f7b82ed7b5b5..586f718b8e95 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -39,7 +39,7 @@ EXPORT_SYMBOL(rtc_lock);   * jump to the next second precisely 500 ms later. Check the Motorola   * MC146818A or Dallas DS12887 data sheet for details.   
*/ -int mach_set_rtc_mmss(const struct timespec *now) +int mach_set_rtc_mmss(const struct timespec64 *now)  {  	unsigned long long nowtime = now->tv_sec;  	struct rtc_time tm; @@ -60,7 +60,7 @@ int mach_set_rtc_mmss(const struct timespec *now)  	return retval;  } -void mach_get_cmos_time(struct timespec *now) +void mach_get_cmos_time(struct timespec64 *now)  {  	unsigned int status, year, mon, day, hour, min, sec, century = 0;  	unsigned long flags; @@ -118,7 +118,7 @@ void mach_get_cmos_time(struct timespec *now)  	} else  		year += CMOS_YEARS_OFFS; -	now->tv_sec = mktime(year, mon, day, hour, min, sec); +	now->tv_sec = mktime64(year, mon, day, hour, min, sec);  	now->tv_nsec = 0;  } @@ -145,13 +145,13 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)  }  EXPORT_SYMBOL(rtc_cmos_write); -int update_persistent_clock(struct timespec now) +int update_persistent_clock64(struct timespec64 now)  {  	return x86_platform.set_wallclock(&now);  }  /* not static: needed by APM */ -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts)  {  	x86_platform.get_wallclock(ts);  } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5c623dfe39d1..2f86d883dd95 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1312,11 +1312,3 @@ static int __init register_kernel_offset_dumper(void)  	return 0;  }  __initcall(register_kernel_offset_dumper); - -void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) -{ -	if (!boot_cpu_has(X86_FEATURE_OSPKE)) -		return; - -	seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma)); -} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index da270b95fe4d..92a3b312a53c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -688,6 +688,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)  	sigset_t *set = sigmask_to_save();  	compat_sigset_t *cset = (compat_sigset_t *) set; +	/* +	 * Increment event counter and perform fixup for the pre-signal +	 * frame. +	 */ +	rseq_signal_deliver(ksig, regs); +  	/* Set up the stack frame */  	if (is_ia32_frame(ksig)) {  		if (ksig->ka.sa.sa_flags & SA_SIGINFO) diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 14c057f29979..9ccbf0576cd0 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)  	BUILD_BUG_ON(NSIGFPE  != 15);  	BUILD_BUG_ON(NSIGSEGV != 7);  	BUILD_BUG_ON(NSIGBUS  != 5); -	BUILD_BUG_ON(NSIGTRAP != 4); +	BUILD_BUG_ON(NSIGTRAP != 5);  	BUILD_BUG_ON(NSIGCHLD != 6);  	BUILD_BUG_ON(NSIGSYS  != 1); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0f1cbb042f49..c2f7d1d2a5c3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -79,13 +79,7 @@  #include <asm/qspinlock.h>  #include <asm/intel-family.h>  #include <asm/cpu_device_id.h> - -/* Number of siblings per CPU package */ -int smp_num_siblings = 1; -EXPORT_SYMBOL(smp_num_siblings); - -/* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; +#include <asm/spec-ctrl.h>  /* representing HT siblings of each logical CPU */  DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -244,6 +238,8 @@ static void notrace start_secondary(void *unused)  	 */  	check_tsc_sync_target(); +	speculative_store_bypass_ht_init(); +  	/*  	 * Lock vector_lock, set CPU online and bring the vector  	 * allocator online. 
Online must be set with vector_lock held @@ -1292,6 +1288,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)  	set_mtrr_aps_delayed_init();  	smp_quirk_init_udelay(); + +	speculative_store_bypass_ht_init();  }  void arch_enable_nonboot_cpus_begin(void) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a3f15ed545b5..6a78d4b36a79 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -1,4 +1,5 @@  // SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h>  #include <linux/errno.h>  #include <linux/sched.h>  #include <linux/sched/mm.h> @@ -19,7 +20,6 @@  #include <linux/elf.h>  #include <asm/elf.h> -#include <asm/compat.h>  #include <asm/ia32.h>  #include <asm/syscalls.h>  #include <asm/mpx.h> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 03f3d7695dac..e6db475164ed 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -299,6 +299,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,  	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=  			NOTIFY_STOP) {  		cond_local_irq_enable(regs); +		clear_siginfo(&info);  		do_trap(trapnr, signr, str, regs, error_code,  			fill_trap_info(regs, signr, trapnr, &info));  	} @@ -834,16 +835,18 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)  	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :  						"simd exception"; -	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) -		return;  	cond_local_irq_enable(regs);  	if (!user_mode(regs)) { -		if (!fixup_exception(regs, trapnr)) { -			task->thread.error_code = error_code; -			task->thread.trap_nr = trapnr; +		if (fixup_exception(regs, trapnr)) +			return; + +		task->thread.error_code = error_code; +		task->thread.trap_nr = trapnr; + +		if (notify_die(DIE_TRAP, str, regs, error_code, +					trapnr, SIGFPE) != NOTIFY_STOP)  			die(str, regs, error_code); -		}  		return;  	} @@ -854,6 +857,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)  	task->thread.trap_nr	= trapnr;  	task->thread.error_code = error_code; +	clear_siginfo(&info);  	info.si_signo		= SIGFPE;  	info.si_errno		= 0;  	info.si_addr		= (void __user *)uprobe_get_trap_addr(regs); @@ -929,6 +933,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)  	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");  	local_irq_enable(); +	clear_siginfo(&info);  	info.si_signo = SIGILL;  	info.si_errno = 0;  	info.si_code = ILL_BADSTK; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 91e6da48cbb6..74392d9d51e0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1067,6 +1067,7 @@ static struct clocksource clocksource_tsc_early = {  	.resume			= tsc_resume,  	.mark_unstable		= tsc_cs_mark_unstable,  	.tick_stable		= tsc_cs_tick_stable, +	.list			= LIST_HEAD_INIT(clocksource_tsc_early.list),  };  /* @@ -1086,6 +1087,7 @@ static struct clocksource clocksource_tsc = {  	.resume			= tsc_resume,  	.mark_unstable		= tsc_cs_mark_unstable,  	.tick_stable		= tsc_cs_tick_stable, +	.list			= LIST_HEAD_INIT(clocksource_tsc.list),  };  void mark_tsc_unstable(char *reason) @@ -1098,13 +1100,9 @@ void mark_tsc_unstable(char *reason)  		clear_sched_clock_stable();  	disable_sched_clock_irqtime();  	pr_info("Marking TSC unstable due to %s\n", reason); -	/* Change only the rating, when not registered */ -	if (clocksource_tsc.mult) { -		clocksource_mark_unstable(&clocksource_tsc); -	} 
else { -		clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; -		clocksource_tsc.rating = 0; -	} + +	clocksource_mark_unstable(&clocksource_tsc_early); +	clocksource_mark_unstable(&clocksource_tsc);  }  EXPORT_SYMBOL_GPL(mark_tsc_unstable); @@ -1244,7 +1242,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)  	/* Don't bother refining TSC on unstable systems */  	if (tsc_unstable) -		return; +		goto unreg;  	/*  	 * Since the work is started early in boot, we may be @@ -1297,11 +1295,12 @@ static void tsc_refine_calibration_work(struct work_struct *work)  out:  	if (tsc_unstable) -		return; +		goto unreg;  	if (boot_cpu_has(X86_FEATURE_ART))  		art_related_clocksource = &clocksource_tsc;  	clocksource_register_khz(&clocksource_tsc, tsc_khz); +unreg:  	clocksource_unregister(&clocksource_tsc_early);  } @@ -1311,8 +1310,8 @@ static int __init init_tsc_clocksource(void)  	if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz)  		return 0; -	if (check_tsc_unstable()) -		return 0; +	if (tsc_unstable) +		goto unreg;  	if (tsc_clocksource_reliable)  		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; @@ -1328,6 +1327,7 @@ static int __init init_tsc_clocksource(void)  		if (boot_cpu_has(X86_FEATURE_ART))  			art_related_clocksource = &clocksource_tsc;  		clocksource_register_khz(&clocksource_tsc, tsc_khz); +unreg:  		clocksource_unregister(&clocksource_tsc_early);  		return 0;  	} diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f44ce0fb3583..ff20b35e98dd 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -278,6 +278,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)  	tsk->thread.error_code	= X86_PF_USER | X86_PF_WRITE;  	tsk->thread.trap_nr	= X86_TRAP_PF; +	clear_siginfo(&info);  	info.si_signo	= SIGSEGV;  	info.si_errno	= 0;  	info.si_code	= SEGV_MAPERR; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 85c7ef23d99f..deb576b23b7c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -293,12 +293,16 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool  	insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);  	/* has the side-effect of processing the entire instruction */  	insn_get_length(insn); -	if (WARN_ON_ONCE(!insn_complete(insn))) +	if (!insn_complete(insn))  		return -ENOEXEC;  	if (is_prefix_bad(insn))  		return -ENOTSUPP; +	/* We should not singlestep on the exception masking instructions */ +	if (insn_masking_exception(insn)) +		return -ENOTSUPP; +  	if (x86_64)  		good_insns = good_insns_64;  	else @@ -1079,8 +1083,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs  		return orig_ret_vaddr;  	if (nleft != rasize) { -		pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " -			"%%ip=%#lx\n", current->pid, regs->sp, regs->ip); +		pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", +		       current->pid, regs->sp, regs->ip);  		force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);  	} diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 795f3a80e576..5e1458f609a1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -117,11 +117,11 @@ SECTIONS  #ifdef CONFIG_X86_64  		. = ALIGN(PAGE_SIZE); -		VMLINUX_SYMBOL(__entry_trampoline_start) = .; +		__entry_trampoline_start = .;  		_entry_trampoline = .;  		*(.entry_trampoline)  		. 
= ALIGN(PAGE_SIZE); -		VMLINUX_SYMBOL(__entry_trampoline_end) = .; +		__entry_trampoline_end = .;  		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");  #endif
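
The per-core SSBD handling added to arch/x86/kernel/process.c above is a reference count across HT siblings: the first sibling that requests SSBD writes MSR_AMD64_LS_CFG with the SSBD bit set, and only the last sibling to drop the request clears it again, matching CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL. The standalone C sketch below models only that transition logic in userspace; the emulated MSR and the variable/function names are illustrative and not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/* Userspace model of the per-core SSBD refcount; names are illustrative. */
static unsigned int core_disable_state;	/* models shared_state->disable_state       */
static bool core_ls_cfg_ssbd;		/* models the SSBD bit in MSR_AMD64_LS_CFG */

static void sibling_update_ssbd(bool want_ssbd)
{
	if (want_ssbd) {
		/* First sibling to enable SSBD writes the "MSR": */
		if (!core_disable_state)
			core_ls_cfg_ssbd = true;
		core_disable_state++;
	} else {
		core_disable_state--;
		/* Last sibling to disable SSBD clears the "MSR": */
		if (!core_disable_state)
			core_ls_cfg_ssbd = false;
	}
	printf("disable_state=%u ssbd=%d\n", core_disable_state, core_ls_cfg_ssbd);
}

int main(void)
{
	sibling_update_ssbd(true);	/* thread 0 enables  -> MSR write   */
	sibling_update_ssbd(true);	/* thread 1 enables  -> no write    */
	sibling_update_ssbd(false);	/* thread 0 disables -> no write    */
	sibling_update_ssbd(false);	/* thread 1 disables -> MSR cleared */
	return 0;
}

In the patch itself the counter is shared_state->disable_state, serialized by shared_state->lock, and the per-CPU LSTATE_SSB bit blocks reentry when a prctl() races with the context switch on the same CPU.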
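
The _TIF_SSBD hunk in __switch_to_xtra() uses the same pattern as the other flags handled there: XOR-ing the previous and next tasks' flag words yields exactly the bits that changed, so the speculation-control MSR is rewritten only when the two tasks actually differ. A minimal sketch of that check, assuming a made-up bit position for _TIF_SSBD:

#include <stdio.h>

#define TIF_SSBD_BIT	5			/* illustrative bit number only */
#define _TIF_SSBD	(1UL << TIF_SSBD_BIT)

static void switch_to_extra(unsigned long tifp, unsigned long tifn)
{
	/* Bits set in (tifp ^ tifn) changed between prev and next: */
	if ((tifp ^ tifn) & _TIF_SSBD)
		printf("SSBD differs: update speculation control for tifn=%#lx\n", tifn);
	else
		printf("SSBD unchanged: skip the MSR write\n");
}

int main(void)
{
	switch_to_extra(0, _TIF_SSBD);		/* prev off, next on -> update */
	switch_to_extra(_TIF_SSBD, _TIF_SSBD);	/* both on           -> skip   */
	return 0;
}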
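
The comment added to pvclock_read_wallclock() states that the u32 seconds field only covers dates between 1970 and 2106. That follows from the field width: 2^32 seconds is roughly 136 years past the Unix epoch. A quick standalone check of the arithmetic (not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Average Gregorian year: 365.2425 days, in seconds. */
	const double secs_per_year = 365.2425 * 24 * 60 * 60;
	double years = ((double)UINT32_MAX + 1.0) / secs_per_year;

	printf("u32 seconds span ~%.1f years -> wraps around year %d\n",
	       years, 1970 + (int)years);
	return 0;
}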
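
The Skylake quirk above enables mcsafe_key when either both advanced-RAS capability bits are set (CAPID0 bits 7:6, mask 0xc0) or any NVDIMM usage-mode bit is set (CAPID5 bits 8:5, mask 0x1e0). A small sketch of that predicate with made-up register values:

#include <stdbool.h>
#include <stdio.h>

static bool memory_mc_recovery_enabled(unsigned int capid0, unsigned int capid5)
{
	/* Advanced RAS SKU: both of CAPID0 bits 7:6 must be set.    */
	/* NVDIMM usage modes: any of CAPID5 bits 8:5 is sufficient. */
	return (capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0);
}

int main(void)
{
	printf("%d\n", memory_mc_recovery_enabled(0xc0, 0x000));	/* 1: advanced RAS SKU */
	printf("%d\n", memory_mc_recovery_enabled(0x40, 0x020));	/* 1: NVDIMM bit 5 set */
	printf("%d\n", memory_mc_recovery_enabled(0x40, 0x000));	/* 0: neither          */
	return 0;
}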