Diffstat (limited to 'arch/x86/kernel/cpu')
32 files changed, 1398 insertions, 307 deletions
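Most of the machine-check churn in the patch below replaces the old per-CPU PFN ring and drain_mcelog_buffer() with a lock-less genpool-backed list: do_machine_check() and machine_check_poll() queue records with mce_gen_pool_add() in #MC context, and mce_gen_pool_process() later replays them to x86_mce_decoder_chain from process context. For orientation only, here is a minimal, hypothetical decoder client sketched against that interface; the module name, messages and priority choice are illustrative and not part of the patch, and it relies solely on mce_register_decode_chain()/mce_unregister_decode_chain() and the notifier callback signature visible in the diff.

#include <linux/module.h>
#include <linux/notifier.h>
#include <asm/mce.h>

/* Called from process context via mce_gen_pool_process(), never in #MC context. */
static int example_mce_decode(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = data;

	if (!m)
		return NOTIFY_DONE;

	pr_info("example MCE decode: CPU%u bank %d status 0x%llx addr 0x%llx\n",
		m->extcpu, m->bank, m->status, m->addr);

	return NOTIFY_OK;
}

static struct notifier_block example_mce_nb = {
	.notifier_call	= example_mce_decode,
	/* Priority left at 0 so the SRAO notifier (INT_MAX) stays first in the chain. */
};

static int __init example_mce_init(void)
{
	mce_register_decode_chain(&example_mce_nb);
	return 0;
}

static void __exit example_mce_exit(void)
{
	mce_unregister_decode_chain(&example_mce_nb);
}

module_init(example_mce_init);
module_exit(example_mce_exit);
MODULE_LICENSE("GPL");

Note that, unlike before this series, registration no longer triggers an immediate drain of early-boot records; they are flushed once at late_initcall time by mcheck_late_init() via mce_schedule_work().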
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9bff68798836..4eb065c6bed2 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \  					   perf_event_intel_uncore_snb.o \  					   perf_event_intel_uncore_snbep.o \  					   perf_event_intel_uncore_nhmex.o +obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_msr.o +obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_msr.o  endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dd3a4baffe50..4a70fc6d400a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -11,6 +11,7 @@  #include <asm/cpu.h>  #include <asm/smp.h>  #include <asm/pci-direct.h> +#include <asm/delay.h>  #ifdef CONFIG_X86_64  # include <asm/mmconfig.h> @@ -114,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)  		const int K6_BUG_LOOP = 1000000;  		int n;  		void (*f_vide)(void); -		unsigned long d, d2; +		u64 d, d2;  		printk(KERN_INFO "AMD K6 stepping B detected - "); @@ -125,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)  		n = K6_BUG_LOOP;  		f_vide = vide; -		rdtscl(d); +		d = rdtsc();  		while (n--)  			f_vide(); -		rdtscl(d2); +		d2 = rdtsc();  		d = d2-d;  		if (d > 20*K6_BUG_LOOP) @@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)  		/* A random value per boot for bit slice [12:upper_bit) */  		va_align.bits = get_random_int() & va_align.mask;  	} + +	if (cpu_has(c, X86_FEATURE_MWAITX)) +		use_mwaitx_delay();  }  static void early_init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 922c5e0cea4c..07ce52c22ec8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@  #include <linux/kgdb.h>  #include <linux/smp.h>  #include <linux/io.h> +#include <linux/syscore_ops.h>  #include <asm/stackprotector.h>  #include <asm/perf_event.h> @@ -1185,10 +1186,10 @@ void syscall_init(void)  	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.  	 */  	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); -	wrmsrl(MSR_LSTAR, entry_SYSCALL_64); +	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);  #ifdef CONFIG_IA32_EMULATION -	wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); +	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);  	/*  	 * This only works on Intel CPUs.  	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. 
@@ -1199,7 +1200,7 @@ void syscall_init(void)  	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);  	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);  #else -	wrmsrl(MSR_CSTAR, ignore_sysret); +	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);  	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);  	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);  	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); @@ -1410,7 +1411,7 @@ void cpu_init(void)  	load_sp0(t, ¤t->thread);  	set_tss_desc(cpu, t);  	load_TR_desc(); -	load_LDT(&init_mm.context); +	load_mm_ldt(&init_mm);  	clear_all_debug_regs();  	dbg_restore_debug_regs(); @@ -1459,7 +1460,7 @@ void cpu_init(void)  	load_sp0(t, thread);  	set_tss_desc(cpu, t);  	load_TR_desc(); -	load_LDT(&init_mm.context); +	load_mm_ldt(&init_mm);  	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); @@ -1488,3 +1489,20 @@ inline bool __static_cpu_has_safe(u16 bit)  	return boot_cpu_has(bit);  }  EXPORT_SYMBOL_GPL(__static_cpu_has_safe); + +static void bsp_resume(void) +{ +	if (this_cpu->c_bsp_resume) +		this_cpu->c_bsp_resume(&boot_cpu_data); +} + +static struct syscore_ops cpu_syscore_ops = { +	.resume		= bsp_resume, +}; + +static int __init init_cpu_syscore(void) +{ +	register_syscore_ops(&cpu_syscore_ops); +	return 0; +} +core_initcall(init_cpu_syscore); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c37dc37e8317..2584265d4745 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -13,6 +13,7 @@ struct cpu_dev {  	void		(*c_init)(struct cpuinfo_x86 *);  	void		(*c_identify)(struct cpuinfo_x86 *);  	void		(*c_detect_tlb)(struct cpuinfo_x86 *); +	void		(*c_bsp_resume)(struct cpuinfo_x86 *);  	int		c_x86_vendor;  #ifdef CONFIG_X86_32  	/* Optional vendor specific routine to obtain the cache size. */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 50163fa9034f..98a13db5f4be 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -371,6 +371,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)  	}  } +static void init_intel_energy_perf(struct cpuinfo_x86 *c) +{ +	u64 epb; + +	/* +	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. +	 * (x86_energy_perf_policy(8) is available to change it at run-time.) +	 */ +	if (!cpu_has(c, X86_FEATURE_EPB)) +		return; + +	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); +	if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) +		return; + +	pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); +	pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); +	epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; +	wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); +} + +static void intel_bsp_resume(struct cpuinfo_x86 *c) +{ +	/* +	 * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, +	 * so reinitialize it properly like during bootup: +	 */ +	init_intel_energy_perf(c); +} +  static void init_intel(struct cpuinfo_x86 *c)  {  	unsigned int l2 = 0; @@ -478,21 +508,7 @@ static void init_intel(struct cpuinfo_x86 *c)  	if (cpu_has(c, X86_FEATURE_VMX))  		detect_vmx_virtcap(c); -	/* -	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. 
-	 * x86_energy_perf_policy(8) is available to change it at run-time -	 */ -	if (cpu_has(c, X86_FEATURE_EPB)) { -		u64 epb; - -		rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); -		if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) { -			pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); -			pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); -			epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; -			wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); -		} -	} +	init_intel_energy_perf(c);  }  #ifdef CONFIG_X86_32 @@ -747,6 +763,7 @@ static const struct cpu_dev intel_cpu_dev = {  	.c_detect_tlb	= intel_detect_tlb,  	.c_early_init   = early_init_intel,  	.c_init		= init_intel, +	.c_bsp_resume	= intel_bsp_resume,  	.c_x86_vendor	= X86_VENDOR_INTEL,  }; diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h index 1c338b0eba05..336878a5d205 100644 --- a/arch/x86/kernel/cpu/intel_pt.h +++ b/arch/x86/kernel/cpu/intel_pt.h @@ -25,32 +25,11 @@   */  #define TOPA_PMI_MARGIN 512 -/* - * Table of Physical Addresses bits - */ -enum topa_sz { -	TOPA_4K	= 0, -	TOPA_8K, -	TOPA_16K, -	TOPA_32K, -	TOPA_64K, -	TOPA_128K, -	TOPA_256K, -	TOPA_512K, -	TOPA_1MB, -	TOPA_2MB, -	TOPA_4MB, -	TOPA_8MB, -	TOPA_16MB, -	TOPA_32MB, -	TOPA_64MB, -	TOPA_128MB, -	TOPA_SZ_END, -}; +#define TOPA_SHIFT 12 -static inline unsigned int sizes(enum topa_sz tsz) +static inline unsigned int sizes(unsigned int tsz)  { -	return 1 << (tsz + 12); +	return 1 << (tsz + TOPA_SHIFT);  };  struct topa_entry { @@ -66,20 +45,26 @@ struct topa_entry {  	u64	rsvd4	: 16;  }; -#define TOPA_SHIFT 12 -#define PT_CPUID_LEAVES 2 +#define PT_CPUID_LEAVES		2 +#define PT_CPUID_REGS_NUM	4 /* number of regsters (eax, ebx, ecx, edx) */  enum pt_capabilities {  	PT_CAP_max_subleaf = 0,  	PT_CAP_cr3_filtering, +	PT_CAP_psb_cyc, +	PT_CAP_mtc,  	PT_CAP_topa_output,  	PT_CAP_topa_multiple_entries, +	PT_CAP_single_range_output,  	PT_CAP_payloads_lip, +	PT_CAP_mtc_periods, +	PT_CAP_cycle_thresholds, +	PT_CAP_psb_periods,  };  struct pt_pmu {  	struct pmu		pmu; -	u32			caps[4 * PT_CPUID_LEAVES]; +	u32			caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];  };  /** diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index bb34b03af252..a3311c886194 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,4 +1,4 @@ -obj-y				=  mce.o mce-severity.o +obj-y				=  mce.o mce-severity.o mce-genpool.o  obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o  obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index a1aef9533154..34c89a3e8260 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)  	m.addr = mem_err->physical_addr;  	mce_log(&m); -	mce_notify_irq();  }  EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c new file mode 100644 index 000000000000..0a850100c594 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -0,0 +1,99 @@ +/* + * MCE event pool management in MCE context + * + * Copyright (C) 2015 Intel Corp. + * Author: Chen, Gong <[email protected]> + * + * This file is licensed under GPLv2. 
+ */ +#include <linux/smp.h> +#include <linux/mm.h> +#include <linux/genalloc.h> +#include <linux/llist.h> +#include "mce-internal.h" + +/* + * printk() is not safe in MCE context. This is a lock-less memory allocator + * used to save error information organized in a lock-less list. + * + * This memory pool is only to be used to save MCE records in MCE context. + * MCE events are rare, so a fixed size memory pool should be enough. Use + * 2 pages to save MCE events for now (~80 MCE records at most). + */ +#define MCE_POOLSZ	(2 * PAGE_SIZE) + +static struct gen_pool *mce_evt_pool; +static LLIST_HEAD(mce_event_llist); +static char gen_pool_buf[MCE_POOLSZ]; + +void mce_gen_pool_process(void) +{ +	struct llist_node *head; +	struct mce_evt_llist *node; +	struct mce *mce; + +	head = llist_del_all(&mce_event_llist); +	if (!head) +		return; + +	head = llist_reverse_order(head); +	llist_for_each_entry(node, head, llnode) { +		mce = &node->mce; +		atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); +		gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); +	} +} + +bool mce_gen_pool_empty(void) +{ +	return llist_empty(&mce_event_llist); +} + +int mce_gen_pool_add(struct mce *mce) +{ +	struct mce_evt_llist *node; + +	if (!mce_evt_pool) +		return -EINVAL; + +	node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); +	if (!node) { +		pr_warn_ratelimited("MCE records pool full!\n"); +		return -ENOMEM; +	} + +	memcpy(&node->mce, mce, sizeof(*mce)); +	llist_add(&node->llnode, &mce_event_llist); + +	return 0; +} + +static int mce_gen_pool_create(void) +{ +	struct gen_pool *tmpp; +	int ret = -ENOMEM; + +	tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); +	if (!tmpp) +		goto out; + +	ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); +	if (ret) { +		gen_pool_destroy(tmpp); +		goto out; +	} + +	mce_evt_pool = tmpp; + +out: +	return ret; +} + +int mce_gen_pool_init(void) +{ +	/* Just init mce_gen_pool once. 
*/ +	if (mce_evt_pool) +		return 0; + +	return mce_gen_pool_create(); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fe32074b865b..547720efd923 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,6 +13,8 @@ enum severity_level {  	MCE_PANIC_SEVERITY,  }; +extern struct atomic_notifier_head x86_mce_decoder_chain; +  #define ATTR_LEN		16  #define INITIAL_CHECK_INTERVAL	5 * 60 /* 5 minutes */ @@ -24,6 +26,16 @@ struct mce_bank {  	char			attrname[ATTR_LEN];	/* attribute name */  }; +struct mce_evt_llist { +	struct llist_node llnode; +	struct mce mce; +}; + +void mce_gen_pool_process(void); +bool mce_gen_pool_empty(void); +int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_init(void); +  extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);  struct dentry *mce_get_debugfs_dir(void); @@ -67,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id)  	return -EINVAL;  }  #endif + +void mce_inject_log(struct mce *m); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index df919ff103c3..9d014b82a124 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -52,11 +52,11 @@  static DEFINE_MUTEX(mce_chrdev_read_mutex); -#define rcu_dereference_check_mce(p) \ +#define mce_log_get_idx_check(p) \  ({ \ -	rcu_lockdep_assert(rcu_read_lock_sched_held() || \ -			   lockdep_is_held(&mce_chrdev_read_mutex), \ -			   "suspicious rcu_dereference_check_mce() usage"); \ +	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ +			 !lockdep_is_held(&mce_chrdev_read_mutex), \ +			 "suspicious mce_log_get_idx_check() usage"); \  	smp_load_acquire(&(p)); \  }) @@ -110,22 +110,24 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {   */  mce_banks_t mce_banks_ce_disabled; -static DEFINE_PER_CPU(struct work_struct, mce_work); +static struct work_struct mce_work; +static struct irq_work mce_irq_work;  static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); +static int mce_usable_address(struct mce *m);  /*   * CPU/chipset specific EDAC code can register a notifier call here to print   * MCE errors in a human-readable form.   
*/ -static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);  /* Do initial initialization of a struct mce */  void mce_setup(struct mce *m)  {  	memset(m, 0, sizeof(struct mce));  	m->cpu = m->extcpu = smp_processor_id(); -	rdtscll(m->tsc); +	m->tsc = rdtsc();  	/* We hope get_seconds stays lockless */  	m->time = get_seconds();  	m->cpuvendor = boot_cpu_data.x86_vendor; @@ -157,12 +159,13 @@ void mce_log(struct mce *mce)  	/* Emit the trace record: */  	trace_mce_record(mce); -	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); +	if (!mce_gen_pool_add(mce)) +		irq_work_queue(&mce_irq_work);  	mce->finished = 0;  	wmb();  	for (;;) { -		entry = rcu_dereference_check_mce(mcelog.next); +		entry = mce_log_get_idx_check(mcelog.next);  		for (;;) {  			/* @@ -196,48 +199,23 @@ void mce_log(struct mce *mce)  	set_bit(0, &mce_need_notify);  } -static void drain_mcelog_buffer(void) +void mce_inject_log(struct mce *m)  { -	unsigned int next, i, prev = 0; - -	next = ACCESS_ONCE(mcelog.next); - -	do { -		struct mce *m; - -		/* drain what was logged during boot */ -		for (i = prev; i < next; i++) { -			unsigned long start = jiffies; -			unsigned retries = 1; - -			m = &mcelog.entry[i]; - -			while (!m->finished) { -				if (time_after_eq(jiffies, start + 2*retries)) -					retries++; - -				cpu_relax(); - -				if (!m->finished && retries >= 4) { -					pr_err("skipping error being logged currently!\n"); -					break; -				} -			} -			smp_rmb(); -			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); -		} - -		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); -		prev = next; -		next = cmpxchg(&mcelog.next, prev, 0); -	} while (next != prev); +	mutex_lock(&mce_chrdev_read_mutex); +	mce_log(m); +	mutex_unlock(&mce_chrdev_read_mutex);  } +EXPORT_SYMBOL_GPL(mce_inject_log); +static struct notifier_block mce_srao_nb;  void mce_register_decode_chain(struct notifier_block *nb)  { +	/* Ensure SRAO notifier has the highest priority in the decode chain. */ +	if (nb != &mce_srao_nb && nb->priority == INT_MAX) +		nb->priority -= 1; +  	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); -	drain_mcelog_buffer();  }  EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -461,61 +439,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)  	}  } -/* - * Simple lockless ring to communicate PFNs from the exception handler with the - * process context work function. This is vastly simplified because there's - * only a single reader and a single writer. 
- */ -#define MCE_RING_SIZE 16	/* we use one entry less */ - -struct mce_ring { -	unsigned short start; -	unsigned short end; -	unsigned long ring[MCE_RING_SIZE]; -}; -static DEFINE_PER_CPU(struct mce_ring, mce_ring); - -/* Runs with CPU affinity in workqueue */ -static int mce_ring_empty(void) -{ -	struct mce_ring *r = this_cpu_ptr(&mce_ring); - -	return r->start == r->end; -} - -static int mce_ring_get(unsigned long *pfn) -{ -	struct mce_ring *r; -	int ret = 0; - -	*pfn = 0; -	get_cpu(); -	r = this_cpu_ptr(&mce_ring); -	if (r->start == r->end) -		goto out; -	*pfn = r->ring[r->start]; -	r->start = (r->start + 1) % MCE_RING_SIZE; -	ret = 1; -out: -	put_cpu(); -	return ret; -} - -/* Always runs in MCE context with preempt off */ -static int mce_ring_add(unsigned long pfn) -{ -	struct mce_ring *r = this_cpu_ptr(&mce_ring); -	unsigned next; - -	next = (r->end + 1) % MCE_RING_SIZE; -	if (next == r->start) -		return -1; -	r->ring[r->end] = pfn; -	wmb(); -	r->end = next; -	return 0; -} -  int mce_available(struct cpuinfo_x86 *c)  {  	if (mca_cfg.disabled) @@ -525,12 +448,10 @@ int mce_available(struct cpuinfo_x86 *c)  static void mce_schedule_work(void)  { -	if (!mce_ring_empty()) -		schedule_work(this_cpu_ptr(&mce_work)); +	if (!mce_gen_pool_empty() && keventd_up()) +		schedule_work(&mce_work);  } -static DEFINE_PER_CPU(struct irq_work, mce_irq_work); -  static void mce_irq_work_cb(struct irq_work *entry)  {  	mce_notify_irq(); @@ -551,8 +472,29 @@ static void mce_report_event(struct pt_regs *regs)  		return;  	} -	irq_work_queue(this_cpu_ptr(&mce_irq_work)); +	irq_work_queue(&mce_irq_work); +} + +static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, +				void *data) +{ +	struct mce *mce = (struct mce *)data; +	unsigned long pfn; + +	if (!mce) +		return NOTIFY_DONE; + +	if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) { +		pfn = mce->addr >> PAGE_SHIFT; +		memory_failure(pfn, MCE_VECTOR, 0); +	} + +	return NOTIFY_OK;  } +static struct notifier_block mce_srao_nb = { +	.notifier_call	= srao_decode_notifier, +	.priority = INT_MAX, +};  /*   * Read ADDR and MISC registers. @@ -672,8 +614,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  		 */  		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {  			if (m.status & MCI_STATUS_ADDRV) { -				mce_ring_add(m.addr >> PAGE_SHIFT); -				mce_schedule_work(); +				m.severity = severity; +				m.usable_addr = mce_usable_address(&m); + +				if (!mce_gen_pool_add(&m)) +					mce_schedule_work();  			}  		} @@ -1029,7 +974,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)  {  	struct mca_config *cfg = &mca_cfg;  	struct mce m, *final; -	enum ctx_state prev_state;  	int i;  	int worst = 0;  	int severity; @@ -1055,7 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)  	int flags = MF_ACTION_REQUIRED;  	int lmce = 0; -	prev_state = ist_enter(regs); +	ist_enter(regs);  	this_cpu_inc(mce_exception_count); @@ -1143,15 +1087,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)  		mce_read_aux(&m, i); -		/* -		 * Action optional error. Queue address for later processing. -		 * When the ring overflows we just ignore the AO error. -		 * RED-PEN add some logging mechanism when -		 * usable_address or mce_add_ring fails. 
-		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 -		 */ -		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) -			mce_ring_add(m.addr >> PAGE_SHIFT); +		/* assuming valid severity level != 0 */ +		m.severity = severity; +		m.usable_addr = mce_usable_address(&m);  		mce_log(&m); @@ -1227,7 +1165,7 @@ out:  	local_irq_disable();  	ist_end_non_atomic();  done: -	ist_exit(regs, prev_state); +	ist_exit(regs);  }  EXPORT_SYMBOL_GPL(do_machine_check); @@ -1247,14 +1185,11 @@ int memory_failure(unsigned long pfn, int vector, int flags)  /*   * Action optional processing happens here (picking up   * from the list of faulting pages that do_machine_check() - * placed into the "ring"). + * placed into the genpool).   */  static void mce_process_work(struct work_struct *dummy)  { -	unsigned long pfn; - -	while (mce_ring_get(&pfn)) -		memory_failure(pfn, MCE_VECTOR, 0); +	mce_gen_pool_process();  }  #ifdef CONFIG_X86_MCE_INTEL @@ -1678,6 +1613,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)  	}  } +static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) +{ +	switch (c->x86_vendor) { +	case X86_VENDOR_INTEL: +		mce_intel_feature_clear(c); +		break; +	default: +		break; +	} +} +  static void mce_start_timer(unsigned int cpu, struct timer_list *t)  {  	unsigned long iv = check_interval * HZ; @@ -1731,13 +1677,36 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)  		return;  	} +	if (mce_gen_pool_init()) { +		mca_cfg.disabled = true; +		pr_emerg("Couldn't allocate MCE records pool!\n"); +		return; +	} +  	machine_check_vector = do_machine_check;  	__mcheck_cpu_init_generic();  	__mcheck_cpu_init_vendor(c);  	__mcheck_cpu_init_timer(); -	INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work); -	init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb); +} + +/* + * Called for each booted CPU to clear some machine checks opt-ins + */ +void mcheck_cpu_clear(struct cpuinfo_x86 *c) +{ +	if (mca_cfg.disabled) +		return; + +	if (!mce_available(c)) +		return; + +	/* +	 * Possibly to clear general settings generic to x86 +	 * __mcheck_cpu_clear_generic(c); +	 */ +	__mcheck_cpu_clear_vendor(c); +  }  /* @@ -1784,7 +1753,7 @@ static void collect_tscs(void *data)  {  	unsigned long *cpu_tsc = (unsigned long *)data; -	rdtscll(cpu_tsc[smp_processor_id()]); +	cpu_tsc[smp_processor_id()] = rdtsc();  }  static int mce_apei_read_done; @@ -1850,7 +1819,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,  			goto out;  	} -	next = rcu_dereference_check_mce(mcelog.next); +	next = mce_log_get_idx_check(mcelog.next);  	/* Only supports full reads right now */  	err = -EINVAL; @@ -2056,8 +2025,12 @@ __setup("mce", mcheck_enable);  int __init mcheck_init(void)  {  	mcheck_intel_therm_init(); +	mce_register_decode_chain(&mce_srao_nb);  	mcheck_vendor_init_severity(); +	INIT_WORK(&mce_work, mce_process_work); +	init_irq_work(&mce_irq_work, mce_irq_work_cb); +  	return 0;  } @@ -2591,5 +2564,20 @@ static int __init mcheck_debugfs_init(void)  	return 0;  } -late_initcall(mcheck_debugfs_init); +#else +static int __init mcheck_debugfs_init(void) { return -EINVAL; }  #endif + +static int __init mcheck_late_init(void) +{ +	mcheck_debugfs_init(); + +	/* +	 * Flush out everything that has been logged during early boot, now that +	 * everything has been initialized (workqueues, decoders, ...). 
+	 */ +	mce_schedule_work(); + +	return 0; +} +late_initcall(mcheck_late_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 844f56c5616d..1e8bb6c94f14 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)  	per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;  } +static void cmci_toggle_interrupt_mode(bool on) +{ +	unsigned long flags, *owned; +	int bank; +	u64 val; + +	raw_spin_lock_irqsave(&cmci_discover_lock, flags); +	owned = this_cpu_ptr(mce_banks_owned); +	for_each_set_bit(bank, owned, MAX_NR_BANKS) { +		rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + +		if (on) +			val |= MCI_CTL2_CMCI_EN; +		else +			val &= ~MCI_CTL2_CMCI_EN; + +		wrmsrl(MSR_IA32_MCx_CTL2(bank), val); +	} +	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} +  unsigned long cmci_intel_adjust_timer(unsigned long interval)  {  	if ((this_cpu_read(cmci_backoff_cnt) > 0) && @@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)  		 */  		if (!atomic_read(&cmci_storm_on_cpus)) {  			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); -			cmci_reenable(); +			cmci_toggle_interrupt_mode(true);  			cmci_recheck();  		}  		return CMCI_POLL_INTERVAL; @@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)  	}  } -static void cmci_storm_disable_banks(void) -{ -	unsigned long flags, *owned; -	int bank; -	u64 val; - -	raw_spin_lock_irqsave(&cmci_discover_lock, flags); -	owned = this_cpu_ptr(mce_banks_owned); -	for_each_set_bit(bank, owned, MAX_NR_BANKS) { -		rdmsrl(MSR_IA32_MCx_CTL2(bank), val); -		val &= ~MCI_CTL2_CMCI_EN; -		wrmsrl(MSR_IA32_MCx_CTL2(bank), val); -	} -	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} -  static bool cmci_storm_detect(void)  {  	unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)  	if (cnt <= CMCI_STORM_THRESHOLD)  		return false; -	cmci_storm_disable_banks(); +	cmci_toggle_interrupt_mode(false);  	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);  	r = atomic_add_return(1, &cmci_storm_on_cpus);  	mce_timer_kick(CMCI_STORM_INTERVAL); @@ -246,7 +251,6 @@ static void intel_threshold_interrupt(void)  		return;  	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); -	mce_notify_irq();  }  /* @@ -435,7 +439,7 @@ static void intel_init_cmci(void)  	cmci_recheck();  } -void intel_init_lmce(void) +static void intel_init_lmce(void)  {  	u64 val; @@ -448,9 +452,26 @@ void intel_init_lmce(void)  		wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);  } +static void intel_clear_lmce(void) +{ +	u64 val; + +	if (!lmce_supported()) +		return; + +	rdmsrl(MSR_IA32_MCG_EXT_CTL, val); +	val &= ~MCG_EXT_CTL_LMCE_EN; +	wrmsrl(MSR_IA32_MCG_EXT_CTL, val); +} +  void mce_intel_feature_init(struct cpuinfo_x86 *c)  {  	intel_init_thermal(c);  	intel_init_cmci();  	intel_init_lmce();  } + +void mce_intel_feature_clear(struct cpuinfo_x86 *c) +{ +	intel_clear_lmce(); +} diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 737b0ad4e61a..12402e10aeff 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;  /* Machine check handler for Pentium class Intel CPUs: */  static void pentium_machine_check(struct pt_regs *regs, long error_code)  { -	enum ctx_state prev_state;  	u32 loaddr, hi, lotype; -	prev_state = 
ist_enter(regs); +	ist_enter(regs);  	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);  	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)  	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); -	ist_exit(regs, prev_state); +	ist_exit(regs);  }  /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 44f138296fbe..01dd8702880b 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,12 +15,12 @@  /* Machine check handler for WinChip C6: */  static void winchip_machine_check(struct pt_regs *regs, long error_code)  { -	enum ctx_state prev_state = ist_enter(regs); +	ist_enter(regs);  	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");  	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); -	ist_exit(regs, prev_state); +	ist_exit(regs);  }  /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 6236a54a63f4..9e3f3c7dd5d7 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -377,17 +377,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)  	return err;  } -static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +static void mc_device_remove(struct device *dev, struct subsys_interface *sif)  {  	int cpu = dev->id;  	if (!cpu_online(cpu)) -		return 0; +		return;  	pr_debug("CPU%d removed\n", cpu);  	microcode_fini_cpu(cpu);  	sysfs_remove_group(&dev->kobj, &mc_attr_group); -	return 0;  }  static struct subsys_interface mc_cpu_interface = { @@ -460,7 +459,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)  	return NOTIFY_OK;  } -static struct notifier_block __refdata mc_cpu_notifier = { +static struct notifier_block mc_cpu_notifier = {  	.notifier_call	= mc_cpu_callback,  }; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 8187b7247d1c..37ea89c11520 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -390,7 +390,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)  }  #ifdef DEBUG -static void __ref show_saved_mc(void) +static void show_saved_mc(void)  {  	int i, j;  	unsigned int sig, pf, rev, total_size, data_size, date; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index aad4bd84b475..381c8b9b3a33 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@  #include <linux/efi.h>  #include <linux/interrupt.h>  #include <linux/irq.h> +#include <linux/kexec.h>  #include <asm/processor.h>  #include <asm/hypervisor.h>  #include <asm/hyperv.h> @@ -28,10 +29,14 @@  #include <asm/i8259.h>  #include <asm/apic.h>  #include <asm/timer.h> +#include <asm/reboot.h>  struct ms_hyperv_info ms_hyperv;  EXPORT_SYMBOL_GPL(ms_hyperv); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); +  #if IS_ENABLED(CONFIG_HYPERV)  static void (*vmbus_handler)(void); @@ -67,8 +72,47 @@ void hv_remove_vmbus_irq(void)  }  EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);  EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); + +void hv_setup_kexec_handler(void (*handler)(void)) +{ +	hv_kexec_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); 
+ +void hv_remove_kexec_handler(void) +{ +	hv_kexec_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); + +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) +{ +	hv_crash_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_crash_handler); + +void hv_remove_crash_handler(void) +{ +	hv_crash_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_crash_handler);  #endif +static void hv_machine_shutdown(void) +{ +	if (kexec_in_progress && hv_kexec_handler) +		hv_kexec_handler(); +	native_machine_shutdown(); +} + +static void hv_machine_crash_shutdown(struct pt_regs *regs) +{ +	if (hv_crash_handler) +		hv_crash_handler(regs); +	native_machine_crash_shutdown(regs); +} + +  static uint32_t  __init ms_hyperv_platform(void)  {  	u32 eax; @@ -114,6 +158,7 @@ static void __init ms_hyperv_init_platform(void)  	 * Extract the features and hints  	 */  	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); +	ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);  	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);  	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", @@ -141,6 +186,9 @@ static void __init ms_hyperv_init_platform(void)  	no_timer_check = 1;  #endif +	machine_ops.shutdown = hv_machine_shutdown; +	machine_ops.crash_shutdown = hv_machine_crash_shutdown; +	mark_tsc_unstable("running on Hyper-V");  }  const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index e7ed0d8ebacb..f891b4750f04 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -448,7 +448,6 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type,  	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,  			     increment);  } -EXPORT_SYMBOL(mtrr_add);  /**   * mtrr_del_page - delete a memory type region @@ -537,7 +536,6 @@ int mtrr_del(int reg, unsigned long base, unsigned long size)  		return -EINVAL;  	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);  } -EXPORT_SYMBOL(mtrr_del);  /**   * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3658de47900f..66dd3fe99b82 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1551,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs)  }  /* Merge two pointer arrays */ -static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)  {  	struct attribute **new;  	int j, i; @@ -2179,24 +2179,32 @@ static unsigned long get_segment_base(unsigned int segment)  	int idx = segment >> 3;  	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL +		struct ldt_struct *ldt; +  		if (idx > LDT_ENTRIES)  			return 0; -		if (idx > current->active_mm->context.size) +		/* IRQs are off, so this synchronizes with smp_store_release */ +		ldt = lockless_dereference(current->active_mm->context.ldt); +		if (!ldt || idx > ldt->size)  			return 0; -		desc = current->active_mm->context.ldt; +		desc = &ldt->entries[idx]; +#else +		return 0; +#endif  	} else {  		if (idx > GDT_ENTRIES)  			return 0; -		desc = raw_cpu_ptr(gdt_page.gdt); +		desc = raw_cpu_ptr(gdt_page.gdt) + idx;  	} -	return get_desc_base(desc + idx); +	return get_desc_base(desc);  } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_IA32_EMULATION  #include 
<asm/compat.h> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3e7fd27dfe20..5edf6d868fc1 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -165,7 +165,7 @@ struct intel_excl_cntrs {  	unsigned	core_id;	/* per-core: core id */  }; -#define MAX_LBR_ENTRIES		16 +#define MAX_LBR_ENTRIES		32  enum {  	X86_PERF_KFREE_SHARED = 0, @@ -594,6 +594,7 @@ struct x86_pmu {  	struct event_constraint *pebs_constraints;  	void		(*pebs_aliases)(struct perf_event *event);  	int 		max_pebs_events; +	unsigned long	free_running_flags;  	/*  	 * Intel LBR @@ -624,6 +625,7 @@ struct x86_pmu {  struct x86_perf_task_context {  	u64 lbr_from[MAX_LBR_ENTRIES];  	u64 lbr_to[MAX_LBR_ENTRIES]; +	u64 lbr_info[MAX_LBR_ENTRIES];  	int lbr_callstack_users;  	int lbr_stack_state;  }; @@ -793,6 +795,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)  ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);  ssize_t intel_event_sysfs_show(char *page, u64 config); +struct attribute **merge_attr(struct attribute **a, struct attribute **b); +  #ifdef CONFIG_CPU_SUP_AMD  int amd_pmu_init(void); @@ -808,20 +812,6 @@ static inline int amd_pmu_init(void)  #ifdef CONFIG_CPU_SUP_INTEL -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ -	/* user explicitly requested branch sampling */ -	if (has_branch_stack(event)) -		return true; - -	/* implicit branch sampling to correct PEBS skid */ -	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && -	    x86_pmu.intel_cap.pebs_format < 2) -		return true; - -	return false; -} -  static inline bool intel_pmu_has_bts(struct perf_event *event)  {  	if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && @@ -873,6 +863,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];  extern struct event_constraint intel_hsw_pebs_event_constraints[]; +extern struct event_constraint intel_skl_pebs_event_constraints[]; +  struct event_constraint *intel_pebs_constraints(struct perf_event *event);  void intel_pmu_pebs_enable(struct perf_event *event); @@ -911,6 +903,8 @@ void intel_pmu_lbr_init_snb(void);  void intel_pmu_lbr_init_hsw(void); +void intel_pmu_lbr_init_skl(void); +  int intel_pmu_setup_lbr_filter(struct perf_event *event);  void intel_pt_interrupt(void); @@ -934,6 +928,7 @@ static inline int is_ht_workaround_enabled(void)  {  	return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);  } +  #else /* CONFIG_CPU_SUP_INTEL */  static inline void reserve_ds_buffers(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index b9826a981fb2..cd9b6d0b10bf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,7 +12,7 @@  #include <linux/init.h>  #include <linux/slab.h>  #include <linux/export.h> -#include <linux/watchdog.h> +#include <linux/nmi.h>  #include <asm/cpufeature.h>  #include <asm/hardirq.h> @@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =  	EVENT_CONSTRAINT_END  }; +struct event_constraint intel_skl_event_constraints[] = { +	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */ +	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */ +	INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */ +	EVENT_CONSTRAINT_END +}; +  static struct extra_reg intel_snb_extra_regs[] __read_mostly = {  	/* must define OFFCORE_RSP_X 
first, see intel_fixup_er() */  	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), @@ -193,6 +201,13 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {  	EVENT_EXTRA_END  }; +static struct extra_reg intel_skl_extra_regs[] __read_mostly = { +	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), +	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), +	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), +	EVENT_EXTRA_END +}; +  EVENT_ATTR_STR(mem-loads,	mem_ld_nhm,	"event=0x0b,umask=0x10,ldlat=3");  EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3");  EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2"); @@ -244,6 +259,200 @@ static u64 intel_pmu_event_map(int hw_event)  	return intel_perfmon_event_map[hw_event];  } +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts. + * - icache miss does not include decoded icache + */ + +#define SKL_DEMAND_DATA_RD		BIT_ULL(0) +#define SKL_DEMAND_RFO			BIT_ULL(1) +#define SKL_ANY_RESPONSE		BIT_ULL(16) +#define SKL_SUPPLIER_NONE		BIT_ULL(17) +#define SKL_L3_MISS_LOCAL_DRAM		BIT_ULL(26) +#define SKL_L3_MISS_REMOTE_HOP0_DRAM	BIT_ULL(27) +#define SKL_L3_MISS_REMOTE_HOP1_DRAM	BIT_ULL(28) +#define SKL_L3_MISS_REMOTE_HOP2P_DRAM	BIT_ULL(29) +#define SKL_L3_MISS			(SKL_L3_MISS_LOCAL_DRAM| \ +					 SKL_L3_MISS_REMOTE_HOP0_DRAM| \ +					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \ +					 SKL_L3_MISS_REMOTE_HOP2P_DRAM) +#define SKL_SPL_HIT			BIT_ULL(30) +#define SKL_SNOOP_NONE			BIT_ULL(31) +#define SKL_SNOOP_NOT_NEEDED		BIT_ULL(32) +#define SKL_SNOOP_MISS			BIT_ULL(33) +#define SKL_SNOOP_HIT_NO_FWD		BIT_ULL(34) +#define SKL_SNOOP_HIT_WITH_FWD		BIT_ULL(35) +#define SKL_SNOOP_HITM			BIT_ULL(36) +#define SKL_SNOOP_NON_DRAM		BIT_ULL(37) +#define SKL_ANY_SNOOP			(SKL_SPL_HIT|SKL_SNOOP_NONE| \ +					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ +					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ +					 SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) +#define SKL_DEMAND_READ			SKL_DEMAND_DATA_RD +#define SKL_SNOOP_DRAM			(SKL_SNOOP_NONE| \ +					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ +					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ +					 SKL_SNOOP_HITM|SKL_SPL_HIT) +#define SKL_DEMAND_WRITE		SKL_DEMAND_RFO +#define SKL_LLC_ACCESS			SKL_ANY_RESPONSE +#define SKL_L3_MISS_REMOTE		(SKL_L3_MISS_REMOTE_HOP0_DRAM| \ +					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \ +					 SKL_L3_MISS_REMOTE_HOP2P_DRAM) + +static __initconst const u64 skl_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_INST_RETIRED.ALL_LOADS */ +		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_INST_RETIRED.ALL_STORES */ +		[ C(RESULT_MISS)   ] = 0x0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x283,	/* ICACHE_64B.MISS */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(LL 
 ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */ +		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */ +		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_INST_RETIRED.ALL_LOADS */ +		[ C(RESULT_MISS)   ] = 0x608,	/* DTLB_LOAD_MISSES.WALK_COMPLETED */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_INST_RETIRED.ALL_STORES */ +		[ C(RESULT_MISS)   ] = 0x649,	/* DTLB_STORE_MISSES.WALK_COMPLETED */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x2085,	/* ITLB_MISSES.STLB_HIT */ +		[ C(RESULT_MISS)   ] = 0xe85,	/* ITLB_MISSES.WALK_COMPLETED */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */ +		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(NODE) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */ +		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */ +		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, +}; + +static __initconst const u64 skl_hw_cache_extra_regs +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| +				       SKL_LLC_ACCESS|SKL_ANY_SNOOP, +		[ C(RESULT_MISS)   ] = SKL_DEMAND_READ| +				       SKL_L3_MISS|SKL_ANY_SNOOP| +				       SKL_SUPPLIER_NONE, +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| +				       SKL_LLC_ACCESS|SKL_ANY_SNOOP, +		[ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE| +				       SKL_L3_MISS|SKL_ANY_SNOOP| +				       SKL_SUPPLIER_NONE, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(NODE) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| +				       SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, +		[ C(RESULT_MISS)   ] = SKL_DEMAND_READ| +				       SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| +				       SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, +		[ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE| +				       SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, +}; +  #define SNB_DMND_DATA_RD	(1ULL << 0)  #define SNB_DMND_RFO		(1ULL << 1)  #define SNB_DMND_IFETCH		(1ULL << 2) @@ -1114,7 +1323,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly =  {  	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */  	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), -	
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), +	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),  	EVENT_EXTRA_END  }; @@ -1594,6 +1803,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)  	loops = 0;  again: +	intel_pmu_lbr_read();  	intel_pmu_ack_status(status);  	if (++loops > 100) {  		static bool warned = false; @@ -1608,16 +1818,16 @@ again:  	inc_irq_stat(apic_perf_irqs); -	intel_pmu_lbr_read();  	/* -	 * CondChgd bit 63 doesn't mean any overflow status. Ignore -	 * and clear the bit. +	 * Ignore a range of extra bits in status that do not indicate +	 * overflow by themselves.  	 */ -	if (__test_and_clear_bit(63, (unsigned long *)&status)) { -		if (!status) -			goto done; -	} +	status &= ~(GLOBAL_STATUS_COND_CHG | +		    GLOBAL_STATUS_ASIF | +		    GLOBAL_STATUS_LBRS_FROZEN); +	if (!status) +		goto done;  	/*  	 * PEBS overflow sets bit 62 in the global status register @@ -1699,18 +1909,22 @@ intel_bts_constraints(struct perf_event *event)  	return NULL;  } -static int intel_alt_er(int idx) +static int intel_alt_er(int idx, u64 config)  { +	int alt_idx;  	if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))  		return idx;  	if (idx == EXTRA_REG_RSP_0) -		return EXTRA_REG_RSP_1; +		alt_idx = EXTRA_REG_RSP_1;  	if (idx == EXTRA_REG_RSP_1) -		return EXTRA_REG_RSP_0; +		alt_idx = EXTRA_REG_RSP_0; -	return idx; +	if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) +		return idx; + +	return alt_idx;  }  static void intel_fixup_er(struct perf_event *event, int idx) @@ -1799,7 +2013,7 @@ again:  		 */  		c = NULL;  	} else { -		idx = intel_alt_er(idx); +		idx = intel_alt_er(idx, reg->config);  		if (idx != reg->idx) {  			raw_spin_unlock_irqrestore(&era->lock, flags);  			goto again; @@ -2253,6 +2467,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event)  	}  } +static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +{ +	unsigned long flags = x86_pmu.free_running_flags; + +	if (event->attr.use_clockid) +		flags &= ~PERF_SAMPLE_TIME; +	return flags; +} +  static int intel_pmu_hw_config(struct perf_event *event)  {  	int ret = x86_pmu_hw_config(event); @@ -2263,7 +2486,8 @@ static int intel_pmu_hw_config(struct perf_event *event)  	if (event->attr.precise_ip) {  		if (!event->attr.freq) {  			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; -			if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) +			if (!(event->attr.sample_type & +			      ~intel_pmu_free_running_flags(event)))  				event->hw.flags |= PERF_X86_EVENT_FREERUNNING;  		}  		if (x86_pmu.pebs_aliases) @@ -2534,7 +2758,7 @@ static int intel_pmu_cpu_prepare(int cpu)  	if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {  		cpuc->shared_regs = allocate_shared_regs(cpu);  		if (!cpuc->shared_regs) -			return NOTIFY_BAD; +			goto err;  	}  	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { @@ -2542,18 +2766,27 @@ static int intel_pmu_cpu_prepare(int cpu)  		cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);  		if (!cpuc->constraint_list) -			return NOTIFY_BAD; +			goto err_shared_regs;  		cpuc->excl_cntrs = allocate_excl_cntrs(cpu); -		if (!cpuc->excl_cntrs) { -			kfree(cpuc->constraint_list); -			kfree(cpuc->shared_regs); -			return NOTIFY_BAD; -		} +		if (!cpuc->excl_cntrs) +			goto err_constraint_list; +  		cpuc->excl_thread_id = 0;  	}  	return NOTIFY_OK; + +err_constraint_list: +	kfree(cpuc->constraint_list); +	cpuc->constraint_list = NULL; + +err_shared_regs: +	kfree(cpuc->shared_regs); +	cpuc->shared_regs = NULL; + +err: +	return NOTIFY_BAD;  }  static 
void intel_pmu_cpu_starting(int cpu) @@ -2685,6 +2918,8 @@ static __initconst const struct x86_pmu core_pmu = {  	.event_map		= intel_pmu_event_map,  	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),  	.apic			= 1, +	.free_running_flags	= PEBS_FREERUNNING_FLAGS, +  	/*  	 * Intel PMCs cannot be accessed sanely above 32-bit width,  	 * so we install an artificial 1<<31 period regardless of @@ -2723,6 +2958,7 @@ static __initconst const struct x86_pmu intel_pmu = {  	.event_map		= intel_pmu_event_map,  	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),  	.apic			= 1, +	.free_running_flags	= PEBS_FREERUNNING_FLAGS,  	/*  	 * Intel PMCs cannot be accessed sanely above 32 bit width,  	 * so we install an artificial 1<<31 period regardless of @@ -3260,6 +3496,29 @@ __init int intel_pmu_init(void)  		pr_cont("Broadwell events, ");  		break; +	case 78: /* 14nm Skylake Mobile */ +	case 94: /* 14nm Skylake Desktop */ +		x86_pmu.late_ack = true; +		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); +		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); +		intel_pmu_lbr_init_skl(); + +		x86_pmu.event_constraints = intel_skl_event_constraints; +		x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; +		x86_pmu.extra_regs = intel_skl_extra_regs; +		x86_pmu.pebs_aliases = intel_pebs_aliases_snb; +		/* all extra regs are per-cpu when HT is on */ +		x86_pmu.flags |= PMU_FL_HAS_RSP_1; +		x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + +		x86_pmu.hw_config = hsw_hw_config; +		x86_pmu.get_event_constraints = hsw_get_event_constraints; +		x86_pmu.cpu_events = hsw_events_attrs; +		WARN_ON(!x86_pmu.format_attrs); +		x86_pmu.cpu_events = hsw_events_attrs; +		pr_cont("Skylake events, "); +		break; +  	default:  		switch (x86_pmu.version) {  		case 1: @@ -3329,7 +3588,7 @@ __init int intel_pmu_init(void)  	 */  	if (x86_pmu.extra_regs) {  		for (er = x86_pmu.extra_regs; er->msr; er++) { -			er->extra_msr_access = check_msr(er->msr, 0x1ffUL); +			er->extra_msr_access = check_msr(er->msr, 0x11UL);  			/* Disable LBR select mapping */  			if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)  				x86_pmu.lbr_sel_map = NULL; @@ -3368,7 +3627,10 @@ static __init int fixup_ht_bug(void)  		return 0;  	} -	watchdog_nmi_disable_all(); +	if (lockup_detector_suspend() != 0) { +		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); +		return 0; +	}  	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -3376,7 +3638,7 @@ static __init int fixup_ht_bug(void)  	x86_pmu.commit_scheduling = NULL;  	x86_pmu.stop_scheduling = NULL; -	watchdog_nmi_enable_all(); +	lockup_detector_resume();  	get_online_cpus(); diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index 43dd672d788b..54690e885759 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -62,9 +62,6 @@ struct bts_buffer {  struct pmu bts_pmu; -void intel_pmu_enable_bts(u64 config); -void intel_pmu_disable_bts(void); -  static size_t buf_size(struct page *page)  {  	return 1 << (PAGE_SHIFT + page_private(page)); diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 188076161c1b..377e8f8ed391 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -952,6 +952,14 @@ static u64 intel_cqm_event_count(struct perf_event *event)  		return 0;  	/* +	 * Getting up-to-date values 
requires an SMP IPI which is not +	 * possible if we're being called in interrupt context. Return +	 * the cached values instead. +	 */ +	if (unlikely(in_interrupt())) +		goto out; + +	/*  	 * Notice that we don't perform the reading of an RMID  	 * atomically, because we can't hold a spin lock across the  	 * IPIs. @@ -1247,7 +1255,7 @@ static inline void cqm_pick_event_reader(int cpu)  	cpumask_set_cpu(cpu, &cqm_cpumask);  } -static void intel_cqm_cpu_prepare(unsigned int cpu) +static void intel_cqm_cpu_starting(unsigned int cpu)  {  	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);  	struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -1288,13 +1296,11 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,  	unsigned int cpu  = (unsigned long)hcpu;  	switch (action & ~CPU_TASKS_FROZEN) { -	case CPU_UP_PREPARE: -		intel_cqm_cpu_prepare(cpu); -		break;  	case CPU_DOWN_PREPARE:  		intel_cqm_cpu_exit(cpu);  		break;  	case CPU_STARTING: +		intel_cqm_cpu_starting(cpu);  		cqm_pick_event_reader(cpu);  		break;  	} @@ -1365,7 +1371,7 @@ static int __init intel_cqm_init(void)  		goto out;  	for_each_online_cpu(i) { -		intel_cqm_cpu_prepare(i); +		intel_cqm_cpu_starting(i);  		cqm_pick_event_reader(i);  	} diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 71fc40238843..84f236ab96b0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -224,6 +224,19 @@ union hsw_tsx_tuning {  #define PEBS_HSW_TSX_FLAGS	0xff00000000ULL +/* Same as HSW, plus TSC */ + +struct pebs_record_skl { +	u64 flags, ip; +	u64 ax, bx, cx, dx; +	u64 si, di, bp, sp; +	u64 r8,  r9,  r10, r11; +	u64 r12, r13, r14, r15; +	u64 status, dla, dse, lat; +	u64 real_ip, tsx_tuning; +	u64 tsc; +}; +  void init_debug_store_on_cpu(int cpu)  {  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -675,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {  	EVENT_CONSTRAINT_END  }; +struct event_constraint intel_skl_pebs_event_constraints[] = { +	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ +	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). 
*/ +	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), +	INTEL_PLD_CONSTRAINT(0x1cd, 0xf),		      /* MEM_TRANS_RETIRED.* */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ +	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ +	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */ +	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */ +	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */ +	/* Allow all events as PEBS with no flags */ +	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), +	EVENT_CONSTRAINT_END +}; +  struct event_constraint *intel_pebs_constraints(struct perf_event *event)  {  	struct event_constraint *c; @@ -754,6 +789,11 @@ void intel_pmu_pebs_disable(struct perf_event *event)  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);  	struct hw_perf_event *hwc = &event->hw;  	struct debug_store *ds = cpuc->ds; +	bool large_pebs = ds->pebs_interrupt_threshold > +		ds->pebs_buffer_base + x86_pmu.pebs_record_size; + +	if (large_pebs) +		intel_pmu_drain_pebs_buffer();  	cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -762,12 +802,8 @@ void intel_pmu_pebs_disable(struct perf_event *event)  	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)  		cpuc->pebs_enabled &= ~(1ULL << 63); -	if (ds->pebs_interrupt_threshold > -	    ds->pebs_buffer_base + x86_pmu.pebs_record_size) { -		intel_pmu_drain_pebs_buffer(); -		if (!pebs_is_enabled(cpuc)) -			perf_sched_cb_dec(event->ctx->pmu); -	} +	if (large_pebs && !pebs_is_enabled(cpuc)) +		perf_sched_cb_dec(event->ctx->pmu);  	if (cpuc->enabled)  		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -885,7 +921,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  	return 0;  } -static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)  {  	if (pebs->tsx_tuning) {  		union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; @@ -894,7 +930,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)  	return 0;  } -static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)  {  	u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; @@ -918,7 +954,7 @@ static void setup_pebs_sample_data(struct perf_event *event,  	 * unconditionally access the 'extra' entries.  	 */  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); -	struct pebs_record_hsw *pebs = __pebs; +	struct pebs_record_skl *pebs = __pebs;  	u64 sample_type;  	int fll, fst, dsrc;  	int fl = event->hw.flags; @@ -1016,6 +1052,16 @@ static void setup_pebs_sample_data(struct perf_event *event,  			data->txn = intel_hsw_transaction(pebs);  	} +	/* +	 * v3 supplies an accurate time stamp, so we use that +	 * for the time stamp. +	 * +	 * We can only do this for the default trace clock. 
+	 */ +	if (x86_pmu.intel_cap.pebs_format >= 3 && +		event->attr.use_clockid == 0) +		data->time = native_sched_clock_from_tsc(pebs->tsc); +  	if (has_branch_stack(event))  		data->br_stack = &cpuc->lbr_stack;  } @@ -1142,6 +1188,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)  	for (at = base; at < top; at += x86_pmu.pebs_record_size) {  		struct pebs_record_nhm *p = at; +		u64 pebs_status;  		/* PEBS v3 has accurate status bits */  		if (x86_pmu.intel_cap.pebs_format >= 3) { @@ -1152,12 +1199,17 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)  			continue;  		} -		bit = find_first_bit((unsigned long *)&p->status, +		pebs_status = p->status & cpuc->pebs_enabled; +		pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + +		bit = find_first_bit((unsigned long *)&pebs_status,  					x86_pmu.max_pebs_events); -		if (bit >= x86_pmu.max_pebs_events) -			continue; -		if (!test_bit(bit, cpuc->active_mask)) +		if (WARN(bit >= x86_pmu.max_pebs_events, +			 "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx", +			 (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled, +			 *(unsigned long long *)cpuc->active_mask))  			continue; +  		/*  		 * The PEBS hardware does not deal well with the situation  		 * when events happen near to each other and multiple bits @@ -1172,27 +1224,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)  		 * one, and it's not possible to reconstruct all events  		 * that caused the PEBS record. It's called collision.  		 * If collision happened, the record will be dropped. -		 *  		 */ -		if (p->status != (1 << bit)) { -			u64 pebs_status; - -			/* slow path */ -			pebs_status = p->status & cpuc->pebs_enabled; -			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; -			if (pebs_status != (1 << bit)) { -				for_each_set_bit(i, (unsigned long *)&pebs_status, -						 MAX_PEBS_EVENTS) -					error[i]++; -				continue; -			} +		if (p->status != (1ULL << bit)) { +			for_each_set_bit(i, (unsigned long *)&pebs_status, +					 x86_pmu.max_pebs_events) +				error[i]++; +			continue;  		} +  		counts[bit]++;  	}  	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {  		if ((counts[bit] == 0) && (error[bit] == 0))  			continue; +  		event = cpuc->events[bit];  		WARN_ON_ONCE(!event);  		WARN_ON_ONCE(!event->attr.precise_ip); @@ -1245,6 +1291,14 @@ void __init intel_ds_init(void)  			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;  			break; +		case 3: +			pr_cont("PEBS fmt3%c, ", pebs_type); +			x86_pmu.pebs_record_size = +						sizeof(struct pebs_record_skl); +			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; +			x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; +			break; +  		default:  			printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);  			x86_pmu.pebs = 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 452a7bd2dedb..b2c9475b7ff2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -13,7 +13,8 @@ enum {  	LBR_FORMAT_EIP		= 0x02,  	LBR_FORMAT_EIP_FLAGS	= 0x03,  	LBR_FORMAT_EIP_FLAGS2	= 0x04, -	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_EIP_FLAGS2, +	LBR_FORMAT_INFO		= 0x05, +	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,  };  static enum { @@ -140,6 +141,13 @@ static void __intel_pmu_lbr_enable(bool pmi)  	u64 debugctl, lbr_select = 0, orig_debugctl;  	/* +	 * No need to unfreeze manually, as v4 can do that as part +	 * of the GLOBAL_STATUS ack. 
+	 */ +	if (pmi && x86_pmu.version >= 4) +		return; + +	/*  	 * No need to reprogram LBR_SELECT in a PMI, as it  	 * did not change.  	 */ @@ -186,6 +194,8 @@ static void intel_pmu_lbr_reset_64(void)  	for (i = 0; i < x86_pmu.lbr_nr; i++) {  		wrmsrl(x86_pmu.lbr_from + i, 0);  		wrmsrl(x86_pmu.lbr_to   + i, 0); +		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) +			wrmsrl(MSR_LBR_INFO_0 + i, 0);  	}  } @@ -230,10 +240,12 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)  	mask = x86_pmu.lbr_nr - 1;  	tos = intel_pmu_lbr_tos(); -	for (i = 0; i < x86_pmu.lbr_nr; i++) { +	for (i = 0; i < tos; i++) {  		lbr_idx = (tos - i) & mask;  		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);  		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); +		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) +			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);  	}  	task_ctx->lbr_stack_state = LBR_NONE;  } @@ -251,10 +263,12 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)  	mask = x86_pmu.lbr_nr - 1;  	tos = intel_pmu_lbr_tos(); -	for (i = 0; i < x86_pmu.lbr_nr; i++) { +	for (i = 0; i < tos; i++) {  		lbr_idx = (tos - i) & mask;  		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);  		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); +		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) +			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);  	}  	task_ctx->lbr_stack_state = LBR_VALID;  } @@ -411,16 +425,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)  	u64 tos = intel_pmu_lbr_tos();  	int i;  	int out = 0; +	int num = x86_pmu.lbr_nr; -	for (i = 0; i < x86_pmu.lbr_nr; i++) { +	if (cpuc->lbr_sel->config & LBR_CALL_STACK) +		num = tos; + +	for (i = 0; i < num; i++) {  		unsigned long lbr_idx = (tos - i) & mask;  		u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;  		int skip = 0; +		u16 cycles = 0;  		int lbr_flags = lbr_desc[lbr_format];  		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);  		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to); +		if (lbr_format == LBR_FORMAT_INFO) { +			u64 info; + +			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); +			mis = !!(info & LBR_INFO_MISPRED); +			pred = !mis; +			in_tx = !!(info & LBR_INFO_IN_TX); +			abort = !!(info & LBR_INFO_ABORT); +			cycles = (info & LBR_INFO_CYCLES); +		}  		if (lbr_flags & LBR_EIP_FLAGS) {  			mis = !!(from & LBR_FROM_FLAG_MISPRED);  			pred = !mis; @@ -450,6 +479,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)  		cpuc->lbr_entries[out].predicted = pred;  		cpuc->lbr_entries[out].in_tx	 = in_tx;  		cpuc->lbr_entries[out].abort	 = abort; +		cpuc->lbr_entries[out].cycles	 = cycles;  		cpuc->lbr_entries[out].reserved	 = 0;  		out++;  	} @@ -947,6 +977,26 @@ void intel_pmu_lbr_init_hsw(void)  	pr_cont("16-deep LBR, ");  } +/* skylake */ +__init void intel_pmu_lbr_init_skl(void) +{ +	x86_pmu.lbr_nr	 = 32; +	x86_pmu.lbr_tos	 = MSR_LBR_TOS; +	x86_pmu.lbr_from = MSR_LBR_NHM_FROM; +	x86_pmu.lbr_to   = MSR_LBR_NHM_TO; + +	x86_pmu.lbr_sel_mask = LBR_SEL_MASK; +	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map; + +	/* +	 * SW branch filter usage: +	 * - support syscall, sysret capture. 
+	 *   That requires LBR_FAR but that means far +	 *   jmp need to be filtered out +	 */ +	pr_cont("32-deep LBR, "); +} +  /* atom */  void __init intel_pmu_lbr_init_atom(void)  { diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 183de719628d..42169283448b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -65,15 +65,21 @@ static struct pt_cap_desc {  } pt_caps[] = {  	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),  	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)), +	PT_CAP(psb_cyc,			0, CR_EBX, BIT(1)), +	PT_CAP(mtc,			0, CR_EBX, BIT(3)),  	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),  	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)), +	PT_CAP(single_range_output,	0, CR_ECX, BIT(2)),  	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)), +	PT_CAP(mtc_periods,		1, CR_EAX, 0xffff0000), +	PT_CAP(cycle_thresholds,	1, CR_EBX, 0xffff), +	PT_CAP(psb_periods,		1, CR_EBX, 0xffff0000),  };  static u32 pt_cap_get(enum pt_capabilities cap)  {  	struct pt_cap_desc *cd = &pt_caps[cap]; -	u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg]; +	u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];  	unsigned int shift = __ffs(cd->mask);  	return (c & cd->mask) >> shift; @@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = {  	.name	= "caps",  }; +PMU_FORMAT_ATTR(cyc,		"config:1"	); +PMU_FORMAT_ATTR(mtc,		"config:9"	);  PMU_FORMAT_ATTR(tsc,		"config:10"	);  PMU_FORMAT_ATTR(noretcomp,	"config:11"	); +PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	); +PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	); +PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);  static struct attribute *pt_formats_attr[] = { +	&format_attr_cyc.attr, +	&format_attr_mtc.attr,  	&format_attr_tsc.attr,  	&format_attr_noretcomp.attr, +	&format_attr_mtc_period.attr, +	&format_attr_cyc_thresh.attr, +	&format_attr_psb_period.attr,  	NULL,  }; @@ -129,10 +145,10 @@ static int __init pt_pmu_hw_init(void)  	for (i = 0; i < PT_CPUID_LEAVES; i++) {  		cpuid_count(20, i, -			    &pt_pmu.caps[CR_EAX + i*4], -			    &pt_pmu.caps[CR_EBX + i*4], -			    &pt_pmu.caps[CR_ECX + i*4], -			    &pt_pmu.caps[CR_EDX + i*4]); +			    &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], +			    &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], +			    &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], +			    &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);  	}  	ret = -ENOMEM; @@ -170,15 +186,65 @@ fail:  	return ret;  } -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) +#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC	| \ +			  RTIT_CTL_CYC_THRESH	| \ +			  RTIT_CTL_PSB_FREQ) + +#define RTIT_CTL_MTC	(RTIT_CTL_MTC_EN	| \ +			 RTIT_CTL_MTC_RANGE) + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN		| \ +			RTIT_CTL_DISRETC	| \ +			RTIT_CTL_CYC_PSB	| \ +			RTIT_CTL_MTC)  static bool pt_event_valid(struct perf_event *event)  {  	u64 config = event->attr.config; +	u64 allowed, requested;  	if ((config & PT_CONFIG_MASK) != config)  		return false; +	if (config & RTIT_CTL_CYC_PSB) { +		if (!pt_cap_get(PT_CAP_psb_cyc)) +			return false; + +		allowed = pt_cap_get(PT_CAP_psb_periods); +		requested = (config & RTIT_CTL_PSB_FREQ) >> +			RTIT_CTL_PSB_FREQ_OFFSET; +		if (requested && (!(allowed & BIT(requested)))) +			return false; + +		allowed = pt_cap_get(PT_CAP_cycle_thresholds); +		requested = (config & RTIT_CTL_CYC_THRESH) >> +			RTIT_CTL_CYC_THRESH_OFFSET; +		if (requested && (!(allowed & BIT(requested)))) +			return false; +	} + +	if (config & RTIT_CTL_MTC) { +		/* +		 * In the unlikely case that CPUID lists 
valid mtc periods, +		 * but not the mtc capability, drop out here. +		 * +		 * Spec says that setting mtc period bits while mtc bit in +		 * CPUID is 0 will #GP, so better safe than sorry. +		 */ +		if (!pt_cap_get(PT_CAP_mtc)) +			return false; + +		allowed = pt_cap_get(PT_CAP_mtc_periods); +		if (!allowed) +			return false; + +		requested = (config & RTIT_CTL_MTC_RANGE) >> +			RTIT_CTL_MTC_RANGE_OFFSET; + +		if (!(allowed & BIT(requested))) +			return false; +	} +  	return true;  } @@ -191,6 +257,11 @@ static void pt_config(struct perf_event *event)  {  	u64 reg; +	if (!event->hw.itrace_started) { +		event->hw.itrace_started = 1; +		wrmsrl(MSR_IA32_RTIT_STATUS, 0); +	} +  	reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;  	if (!event->attr.exclude_kernel) @@ -910,7 +981,6 @@ void intel_pt_interrupt(void)  		pt_config_buffer(buf->cur->table, buf->cur_idx,  				 buf->output_off); -		wrmsrl(MSR_IA32_RTIT_STATUS, 0);  		pt_config(event);  	}  } @@ -934,7 +1004,6 @@ static void pt_event_start(struct perf_event *event, int mode)  	pt_config_buffer(buf->cur->table, buf->cur_idx,  			 buf->output_off); -	wrmsrl(MSR_IA32_RTIT_STATUS, 0);  	pt_config(event);  } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 5cbd4e64feb5..81431c0f0614 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {  			 1<<RAPL_IDX_RAM_NRG_STAT|\  			 1<<RAPL_IDX_PP1_NRG_STAT) +/* Knights Landing has PKG, RAM */ +#define RAPL_IDX_KNL	(1<<RAPL_IDX_PKG_NRG_STAT|\ +			 1<<RAPL_IDX_RAM_NRG_STAT) +  /*   * event code: LSB 8 bits, passed in attr->config   * any other bit is reserved @@ -486,6 +490,18 @@ static struct attribute *rapl_events_hsw_attr[] = {  	NULL,  }; +static struct attribute *rapl_events_knl_attr[] = { +	EVENT_PTR(rapl_pkg), +	EVENT_PTR(rapl_ram), + +	EVENT_PTR(rapl_pkg_unit), +	EVENT_PTR(rapl_ram_unit), + +	EVENT_PTR(rapl_pkg_scale), +	EVENT_PTR(rapl_ram_scale), +	NULL, +}; +  static struct attribute_group rapl_pmu_events_group = {  	.name = "events",  	.attrs = NULL, /* patched at runtime */ @@ -730,6 +746,10 @@ static int __init rapl_pmu_init(void)  		rapl_cntr_mask = RAPL_IDX_SRV;  		rapl_pmu_events_group.attrs = rapl_events_srv_attr;  		break; +	case 87: /* Knights Landing */ +		rapl_add_quirk(rapl_hsw_server_quirk); +		rapl_cntr_mask = RAPL_IDX_KNL; +		rapl_pmu_events_group.attrs = rapl_events_knl_attr;  	default:  		/* unsupported */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 21b5e38c921b..560e5255b15e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -911,6 +911,9 @@ static int __init uncore_pci_init(void)  	case 63: /* Haswell-EP */  		ret = hswep_uncore_pci_init();  		break; +	case 86: /* BDX-DE */ +		ret = bdx_uncore_pci_init(); +		break;  	case 42: /* Sandy Bridge */  		ret = snb_uncore_pci_init();  		break; @@ -1209,6 +1212,11 @@ static int __init uncore_cpu_init(void)  		break;  	case 42: /* Sandy Bridge */  	case 58: /* Ivy Bridge */ +	case 60: /* Haswell */ +	case 69: /* Haswell */ +	case 70: /* Haswell */ +	case 61: /* Broadwell */ +	case 71: /* Broadwell */  		snb_uncore_cpu_init();  		break;  	case 45: /* Sandy Bridge-EP */ @@ -1224,6 +1232,9 @@ static int __init uncore_cpu_init(void)  	case 63: /* Haswell-EP */  		hswep_uncore_cpu_init();  		break; +	
case 86: /* BDX-DE */ +		bdx_uncore_cpu_init(); +		break;  	default:  		return 0;  	} diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 0f77f0a196e4..72c54c2e5b1a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -336,6 +336,8 @@ int ivbep_uncore_pci_init(void);  void ivbep_uncore_cpu_init(void);  int hswep_uncore_pci_init(void);  void hswep_uncore_cpu_init(void); +int bdx_uncore_pci_init(void); +void bdx_uncore_cpu_init(void);  /* perf_event_intel_uncore_nhmex.c */  void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index b005a78c7012..f78574b3cb55 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -45,6 +45,11 @@  #define SNB_UNC_CBO_0_PER_CTR0                  0x706  #define SNB_UNC_CBO_MSR_OFFSET                  0x10 +/* SNB ARB register */ +#define SNB_UNC_ARB_PER_CTR0			0x3b0 +#define SNB_UNC_ARB_PERFEVTSEL0			0x3b2 +#define SNB_UNC_ARB_MSR_OFFSET			0x10 +  /* NHM global control register */  #define NHM_UNC_PERF_GLOBAL_CTL                 0x391  #define NHM_UNC_FIXED_CTR                       0x394 @@ -115,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = {  	.read_counter	= uncore_msr_read_counter,  }; -static struct event_constraint snb_uncore_cbox_constraints[] = { +static struct event_constraint snb_uncore_arb_constraints[] = {  	UNCORE_EVENT_CONSTRAINT(0x80, 0x1),  	UNCORE_EVENT_CONSTRAINT(0x83, 0x1),  	EVENT_CONSTRAINT_END @@ -134,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = {  	.single_fixed	= 1,  	.event_mask	= SNB_UNC_RAW_EVENT_MASK,  	.msr_offset	= SNB_UNC_CBO_MSR_OFFSET, -	.constraints	= snb_uncore_cbox_constraints,  	.ops		= &snb_uncore_msr_ops,  	.format_group	= &snb_uncore_format_group,  	.event_descs	= snb_uncore_events,  }; +static struct intel_uncore_type snb_uncore_arb = { +	.name		= "arb", +	.num_counters   = 2, +	.num_boxes	= 1, +	.perf_ctr_bits	= 44, +	.perf_ctr	= SNB_UNC_ARB_PER_CTR0, +	.event_ctl	= SNB_UNC_ARB_PERFEVTSEL0, +	.event_mask	= SNB_UNC_RAW_EVENT_MASK, +	.msr_offset	= SNB_UNC_ARB_MSR_OFFSET, +	.constraints	= snb_uncore_arb_constraints, +	.ops		= &snb_uncore_msr_ops, +	.format_group	= &snb_uncore_format_group, +}; +  static struct intel_uncore_type *snb_msr_uncores[] = {  	&snb_uncore_cbox, +	&snb_uncore_arb,  	NULL,  }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 6d6e85dd5849..694510a887dc 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2215,7 +2215,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = {  	NULL,  }; -static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { +static const struct pci_device_id hswep_uncore_pci_ids[] = {  	{ /* Home Agent 0 */  		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),  		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), @@ -2321,3 +2321,167 @@ int hswep_uncore_pci_init(void)  	return 0;  }  /* end of Haswell-EP uncore support */ + +/* BDX-DE uncore support */ + +static struct intel_uncore_type bdx_uncore_ubox = { +	.name			= "ubox", +	.num_counters		= 2, +	.num_boxes		= 1, +	.perf_ctr_bits		= 48, +	.fixed_ctr_bits		= 48, +	.perf_ctr		= HSWEP_U_MSR_PMON_CTR0, +	.event_ctl		= HSWEP_U_MSR_PMON_CTL0, +	.event_mask		= 
SNBEP_U_MSR_PMON_RAW_EVENT_MASK, +	.fixed_ctr		= HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, +	.fixed_ctl		= HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, +	.num_shared_regs	= 1, +	.ops			= &ivbep_uncore_msr_ops, +	.format_group		= &ivbep_uncore_ubox_format_group, +}; + +static struct event_constraint bdx_uncore_cbox_constraints[] = { +	UNCORE_EVENT_CONSTRAINT(0x09, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x11, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x36, 0x1), +	EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_cbox = { +	.name			= "cbox", +	.num_counters		= 4, +	.num_boxes		= 8, +	.perf_ctr_bits		= 48, +	.event_ctl		= HSWEP_C0_MSR_PMON_CTL0, +	.perf_ctr		= HSWEP_C0_MSR_PMON_CTR0, +	.event_mask		= SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, +	.box_ctl		= HSWEP_C0_MSR_PMON_BOX_CTL, +	.msr_offset		= HSWEP_CBO_MSR_OFFSET, +	.num_shared_regs	= 1, +	.constraints		= bdx_uncore_cbox_constraints, +	.ops			= &hswep_uncore_cbox_ops, +	.format_group		= &hswep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type *bdx_msr_uncores[] = { +	&bdx_uncore_ubox, +	&bdx_uncore_cbox, +	&hswep_uncore_pcu, +	NULL, +}; + +void bdx_uncore_cpu_init(void) +{ +	if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) +		bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; +	uncore_msr_uncores = bdx_msr_uncores; +} + +static struct intel_uncore_type bdx_uncore_ha = { +	.name		= "ha", +	.num_counters   = 4, +	.num_boxes	= 1, +	.perf_ctr_bits	= 48, +	SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_imc = { +	.name		= "imc", +	.num_counters   = 5, +	.num_boxes	= 2, +	.perf_ctr_bits	= 48, +	.fixed_ctr_bits	= 48, +	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, +	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, +	.event_descs	= hswep_uncore_imc_events, +	SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_irp = { +	.name			= "irp", +	.num_counters		= 4, +	.num_boxes		= 1, +	.perf_ctr_bits		= 48, +	.event_mask		= SNBEP_PMON_RAW_EVENT_MASK, +	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL, +	.ops			= &hswep_uncore_irp_ops, +	.format_group		= &snbep_uncore_format_group, +}; + + +static struct event_constraint bdx_uncore_r2pcie_constraints[] = { +	UNCORE_EVENT_CONSTRAINT(0x10, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x11, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x13, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x23, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x25, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x26, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), +	EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r2pcie = { +	.name		= "r2pcie", +	.num_counters   = 4, +	.num_boxes	= 1, +	.perf_ctr_bits	= 48, +	.constraints	= bdx_uncore_r2pcie_constraints, +	SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { +	BDX_PCI_UNCORE_HA, +	BDX_PCI_UNCORE_IMC, +	BDX_PCI_UNCORE_IRP, +	BDX_PCI_UNCORE_R2PCIE, +}; + +static struct intel_uncore_type *bdx_pci_uncores[] = { +	[BDX_PCI_UNCORE_HA]	= &bdx_uncore_ha, +	[BDX_PCI_UNCORE_IMC]	= &bdx_uncore_imc, +	[BDX_PCI_UNCORE_IRP]	= &bdx_uncore_irp, +	[BDX_PCI_UNCORE_R2PCIE]	= &bdx_uncore_r2pcie, +	NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = { +	{ /* Home Agent 0 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), +		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), +	}, +	{ /* MC0 Channel 0 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), +		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), +	}, +	{ /* MC0 Channel 1 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), +		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), +	}, +	{ /* IRP */ +		
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), +		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), +	}, +	{ /* R2PCIe */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), +		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), +	}, +	{ /* end: all zeroes */ } +}; + +static struct pci_driver bdx_uncore_pci_driver = { +	.name		= "bdx_uncore", +	.id_table	= bdx_uncore_pci_ids, +}; + +int bdx_uncore_pci_init(void) +{ +	int ret = snbep_pci2phy_map_init(0x6f1e); + +	if (ret) +		return ret; +	uncore_pci_uncores = bdx_pci_uncores; +	uncore_pci_driver = &bdx_uncore_pci_driver; +	return 0; +} + +/* end of BDX-DE uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c new file mode 100644 index 000000000000..086b12eae794 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -0,0 +1,242 @@ +#include <linux/perf_event.h> + +enum perf_msr_id { +	PERF_MSR_TSC			= 0, +	PERF_MSR_APERF			= 1, +	PERF_MSR_MPERF			= 2, +	PERF_MSR_PPERF			= 3, +	PERF_MSR_SMI			= 4, + +	PERF_MSR_EVENT_MAX, +}; + +bool test_aperfmperf(int idx) +{ +	return boot_cpu_has(X86_FEATURE_APERFMPERF); +} + +bool test_intel(int idx) +{ +	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || +	    boot_cpu_data.x86 != 6) +		return false; + +	switch (boot_cpu_data.x86_model) { +	case 30: /* 45nm Nehalem    */ +	case 26: /* 45nm Nehalem-EP */ +	case 46: /* 45nm Nehalem-EX */ + +	case 37: /* 32nm Westmere    */ +	case 44: /* 32nm Westmere-EP */ +	case 47: /* 32nm Westmere-EX */ + +	case 42: /* 32nm SandyBridge         */ +	case 45: /* 32nm SandyBridge-E/EN/EP */ + +	case 58: /* 22nm IvyBridge       */ +	case 62: /* 22nm IvyBridge-EP/EX */ + +	case 60: /* 22nm Haswell Core */ +	case 63: /* 22nm Haswell Server */ +	case 69: /* 22nm Haswell ULT */ +	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + +	case 61: /* 14nm Broadwell Core-M */ +	case 86: /* 14nm Broadwell Xeon D */ +	case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ +	case 79: /* 14nm Broadwell Server */ + +	case 55: /* 22nm Atom "Silvermont"                */ +	case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ +	case 76: /* 14nm Atom "Airmont"                   */ +		if (idx == PERF_MSR_SMI) +			return true; +		break; + +	case 78: /* 14nm Skylake Mobile */ +	case 94: /* 14nm Skylake Desktop */ +		if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) +			return true; +		break; +	} + +	return false; +} + +struct perf_msr { +	u64	msr; +	struct	perf_pmu_events_attr *attr; +	bool	(*test)(int idx); +}; + +PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04"); + +static struct perf_msr msr[] = { +	[PERF_MSR_TSC]   = { 0,			&evattr_tsc,	NULL,		 }, +	[PERF_MSR_APERF] = { MSR_IA32_APERF,	&evattr_aperf,	test_aperfmperf, }, +	[PERF_MSR_MPERF] = { MSR_IA32_MPERF,	&evattr_mperf,	test_aperfmperf, }, +	[PERF_MSR_PPERF] = { MSR_PPERF,		&evattr_pperf,	test_intel,	 }, +	[PERF_MSR_SMI]   = { MSR_SMI_COUNT,	&evattr_smi,	test_intel,	 }, +}; + +static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { +	NULL, +}; + +static struct attribute_group events_attr_group = { +	.name = "events", +	.attrs = events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); +static struct attribute *format_attrs[] = { +	&format_attr_event.attr, +	NULL, +}; +static struct attribute_group 
format_attr_group = { +	.name = "format", +	.attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { +	&events_attr_group, +	&format_attr_group, +	NULL, +}; + +static int msr_event_init(struct perf_event *event) +{ +	u64 cfg = event->attr.config; + +	if (event->attr.type != event->pmu->type) +		return -ENOENT; + +	if (cfg >= PERF_MSR_EVENT_MAX) +		return -EINVAL; + +	/* unsupported modes and filters */ +	if (event->attr.exclude_user   || +	    event->attr.exclude_kernel || +	    event->attr.exclude_hv     || +	    event->attr.exclude_idle   || +	    event->attr.exclude_host   || +	    event->attr.exclude_guest  || +	    event->attr.sample_period) /* no sampling */ +		return -EINVAL; + +	if (!msr[cfg].attr) +		return -EINVAL; + +	event->hw.idx = -1; +	event->hw.event_base = msr[cfg].msr; +	event->hw.config = cfg; + +	return 0; +} + +static inline u64 msr_read_counter(struct perf_event *event) +{ +	u64 now; + +	if (event->hw.event_base) +		rdmsrl(event->hw.event_base, now); +	else +		rdtscll(now); + +	return now; +} +static void msr_event_update(struct perf_event *event) +{ +	u64 prev, now; +	s64 delta; + +	/* Careful, an NMI might modify the previous event value. */ +again: +	prev = local64_read(&event->hw.prev_count); +	now = msr_read_counter(event); + +	if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) +		goto again; + +	delta = now - prev; +	if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { +		delta <<= 32; +		delta >>= 32; /* sign extend */ +	} +	local64_add(now - prev, &event->count); +} + +static void msr_event_start(struct perf_event *event, int flags) +{ +	u64 now; + +	now = msr_read_counter(event); +	local64_set(&event->hw.prev_count, now); +} + +static void msr_event_stop(struct perf_event *event, int flags) +{ +	msr_event_update(event); +} + +static void msr_event_del(struct perf_event *event, int flags) +{ +	msr_event_stop(event, PERF_EF_UPDATE); +} + +static int msr_event_add(struct perf_event *event, int flags) +{ +	if (flags & PERF_EF_START) +		msr_event_start(event, flags); + +	return 0; +} + +static struct pmu pmu_msr = { +	.task_ctx_nr	= perf_sw_context, +	.attr_groups	= attr_groups, +	.event_init	= msr_event_init, +	.add		= msr_event_add, +	.del		= msr_event_del, +	.start		= msr_event_start, +	.stop		= msr_event_stop, +	.read		= msr_event_update, +	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT, +}; + +static int __init msr_init(void) +{ +	int i, j = 0; + +	if (!boot_cpu_has(X86_FEATURE_TSC)) { +		pr_cont("no MSR PMU driver.\n"); +		return 0; +	} + +	/* Probe the MSRs. */ +	for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { +		u64 val; + +		/* +		 * Virt sucks arse; you cannot tell if a R/O MSR is present :/ +		 */ +		if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) +			msr[i].attr = NULL; +	} + +	/* List remaining MSRs in the sysfs attrs. */ +	for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { +		if (msr[i].attr) +			events_attrs[j++] = &msr[i].attr->attr.attr; +	} +	events_attrs[j] = NULL; + +	perf_pmu_register(&pmu_msr, "msr", -1); + +	return 0; +} +device_initcall(msr_init);  |
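
As a rough user-space sketch (not part of this patch set), the new "msr" PMU registered above can be exercised through perf_event_open(2). The example below assumes a kernel carrying this change, the standard /sys/bus/event_source/devices/msr/type node that perf exposes for dynamic PMUs, and config=1 for the aperf counter as encoded by the "event=0x01" attribute string in the driver; it is a minimal illustration, not a definitive tool.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

/* glibc provides no wrapper for perf_event_open(2). */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t before, after;
	unsigned int type;
	FILE *f;
	int fd;

	/* Dynamic PMUs publish their type id in sysfs once registered. */
	f = fopen("/sys/bus/event_source/devices/msr/type", "r");
	if (!f || fscanf(f, "%u", &type) != 1) {
		fprintf(stderr, "msr PMU not available\n");
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = type;
	attr.config = 1;	/* aperf, per the "event=0x01" attribute */
	/* No sample_period and no exclude_* bits: msr_event_init() rejects both. */

	/* System-wide on CPU 0; typically needs root or a low perf_event_paranoid. */
	fd = perf_event_open(&attr, -1, 0, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	read(fd, &before, sizeof(before));
	sleep(1);
	read(fd, &after, sizeof(after));
	printf("APERF ticks on CPU0 over ~1s: %llu\n",
	       (unsigned long long)(after - before));
	return 0;
}

The same events are reachable from the perf tool without any code, e.g. "perf stat -e msr/aperf/,msr/mperf/ -a sleep 1", once the driver above has registered the PMU.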