Diffstat (limited to 'arch/x86/kernel/cpu/resctrl/monitor.c')
-rw-r--r--	arch/x86/kernel/cpu/resctrl/monitor.c	250
1 file changed, 206 insertions(+), 44 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 366f496ca3ce..851b561850e0 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -15,6 +15,8 @@
  * Software Developer Manual June 2016, volume 3, section 17.17.
  */
 
+#define pr_fmt(fmt)	"resctrl: " fmt
+
 #include <linux/cpu.h>
 #include <linux/module.h>
 #include <linux/sizes.h>
@@ -97,6 +99,8 @@ unsigned int resctrl_rmid_realloc_limit;
 
 #define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))
 
+static int snc_nodes_per_l3_cache = 1;
+
 /*
  * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
  * If rmid > rmid threshold, MBM total and local values should be multiplied
@@ -185,7 +189,43 @@ static inline struct rmid_entry *__rmid_entry(u32 idx)
 	return entry;
 }
 
-static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache.  So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+	if (snc_nodes_per_l3_cache == 1)
+		return lrmid;
+
+	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
+}
+
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
 {
 	u64 msr_val;
 
@@ -197,7 +237,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
 	 * are error bits.
 	 */
-	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
 	rdmsrl(MSR_IA32_QM_CTR, msr_val);
 
 	if (msr_val & RMID_VAL_ERROR)
@@ -209,7 +249,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 	return 0;
 }
 
-static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
 						 u32 rmid,
 						 enum resctrl_event_id eventid)
 {
@@ -228,19 +268,22 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
 	return NULL;
 }
 
-void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
 			     u32 unused, u32 rmid,
 			     enum resctrl_event_id eventid)
 {
-	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+	int cpu = cpumask_any(&d->hdr.cpu_mask);
 	struct arch_mbm_state *am;
+	u32 prmid;
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
 	if (am) {
 		memset(am, 0, sizeof(*am));
 
+		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
 		/* Record any initial, non-zero count value. */
-		__rmid_read(rmid, eventid, &am->prev_msr);
+		__rmid_read_phys(prmid, eventid, &am->prev_msr);
 	}
 }
 
@@ -248,9 +291,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
  * Assumes that hardware counters are also reset and thus that there is
  * no need to record initial non-zero counts.
  */
-void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
 {
-	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
 
 	if (is_mbm_total_enabled())
 		memset(hw_dom->arch_mbm_total, 0,
@@ -269,22 +312,22 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
 	return chunks >> shift;
 }
 
-int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
 			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
 			   u64 *val, void *ignored)
 {
+	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	int cpu = cpumask_any(&d->hdr.cpu_mask);
 	struct arch_mbm_state *am;
 	u64 msr_val, chunks;
+	u32 prmid;
 	int ret;
 
 	resctrl_arch_rmid_read_context_check();
 
-	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
-		return -EINVAL;
-
-	ret = __rmid_read(rmid, eventid, &msr_val);
+	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+	ret = __rmid_read_phys(prmid, eventid, &msr_val);
 	if (ret)
 		return ret;
 
@@ -320,7 +363,7 @@ static void limbo_release_entry(struct rmid_entry *entry)
 * decrement the count. If the busy count gets to zero on an RMID, we
 * free the RMID
 */
-void __check_limbo(struct rdt_domain *d, bool force_free)
+void __check_limbo(struct rdt_mon_domain *d, bool force_free)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
 	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
@@ -364,7 +407,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
 			 * CLOSID and RMID because there may be dependencies between them
 			 * on some architectures.
 			 */
-			trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->id, val);
+			trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
 		}
 
 		if (force_free || !rmid_dirty) {
@@ -378,7 +421,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
 	resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
 }
 
-bool has_busy_rmid(struct rdt_domain *d)
+bool has_busy_rmid(struct rdt_mon_domain *d)
 {
 	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
 
@@ -479,7 +522,7 @@ int alloc_rmid(u32 closid)
 static void add_rmid_to_limbo(struct rmid_entry *entry)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-	struct rdt_domain *d;
+	struct rdt_mon_domain *d;
 	u32 idx;
 
 	lockdep_assert_held(&rdtgroup_mutex);
@@ -490,7 +533,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
 	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
 
 	entry->busy = 0;
-	list_for_each_entry(d, &r->domains, list) {
+	list_for_each_entry(d, &r->mon_domains, hdr.list) {
 		/*
 		 * For the first limbo RMID in the domain,
 		 * setup up the limbo worker.
@@ -532,7 +575,7 @@ void free_rmid(u32 closid, u32 rmid)
 		list_add_tail(&entry->list, &rmid_free_lru);
 }
 
-static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
+static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
 				       u32 rmid, enum resctrl_event_id evtid)
 {
 	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
@@ -549,7 +592,10 @@ static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
 
 static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
 {
+	int cpu = smp_processor_id();
+	struct rdt_mon_domain *d;
 	struct mbm_state *m;
+	int err, ret;
 	u64 tval = 0;
 
 	if (rr->first) {
@@ -560,14 +606,47 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
 		return 0;
 	}
 
-	rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
-					 &tval, rr->arch_mon_ctx);
-	if (rr->err)
-		return rr->err;
+	if (rr->d) {
+		/* Reading a single domain, must be on a CPU in that domain. */
+		if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
+			return -EINVAL;
+		rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
+						 rr->evtid, &tval, rr->arch_mon_ctx);
+		if (rr->err)
+			return rr->err;
 
-	rr->val += tval;
+		rr->val += tval;
 
-	return 0;
+		return 0;
+	}
+
+	/* Summing domains that share a cache, must be on a CPU for that cache. */
+	if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
+		return -EINVAL;
+
+	/*
+	 * Legacy files must report the sum of an event across all
+	 * domains that share the same L3 cache instance.
+	 * Report success if a read from any domain succeeds, -EINVAL
+	 * (translated to "Unavailable" for user space) if reading from
+	 * all domains fail for any reason.
+	 */
+	ret = -EINVAL;
+	list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
+		if (d->ci->id != rr->ci->id)
+			continue;
+		err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
+					     rr->evtid, &tval, rr->arch_mon_ctx);
+		if (!err) {
+			rr->val += tval;
+			ret = 0;
+		}
+	}
+
+	if (ret)
+		rr->err = ret;
+
+	return ret;
 }
 
 /*
@@ -668,12 +747,12 @@ void mon_event_count(void *info)
  * throttle MSRs already have low percentage values.  To avoid
 * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
 */
-static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
 {
 	u32 closid, rmid, cur_msr_val, new_msr_val;
 	struct mbm_state *pmbm_data, *cmbm_data;
+	struct rdt_ctrl_domain *dom_mba;
 	struct rdt_resource *r_mba;
-	struct rdt_domain *dom_mba;
 	u32 cur_bw, user_bw, idx;
 	struct list_head *head;
 	struct rdtgroup *entry;
@@ -688,7 +767,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	idx = resctrl_arch_rmid_idx_encode(closid, rmid);
 	pmbm_data = &dom_mbm->mbm_local[idx];
 
-	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
 	if (!dom_mba) {
 		pr_warn_once("Failure to get domain for MBA update\n");
 		return;
@@ -734,12 +813,11 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
 }
 
-static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
+static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
 		       u32 closid, u32 rmid)
 {
-	struct rmid_read rr;
+	struct rmid_read rr = {0};
 
-	rr.first = false;
 	rr.r = r;
 	rr.d = d;
 
@@ -792,17 +870,17 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
 void cqm_handle_limbo(struct work_struct *work)
 {
 	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
-	struct rdt_domain *d;
+	struct rdt_mon_domain *d;
 
 	cpus_read_lock();
 	mutex_lock(&rdtgroup_mutex);
 
-	d = container_of(work, struct rdt_domain, cqm_limbo.work);
+	d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
 
 	__check_limbo(d, false);
 
 	if (has_busy_rmid(d)) {
-		d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
+		d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
 							   RESCTRL_PICK_ANY_CPU);
 		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
 					 delay);
@@ -820,13 +898,13 @@ void cqm_handle_limbo(struct work_struct *work)
 * @exclude_cpu:   Which CPU the handler should not run on,
 *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
 			     int exclude_cpu)
 {
 	unsigned long delay = msecs_to_jiffies(delay_ms);
 	int cpu;
 
-	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
+	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
 	dom->cqm_work_cpu = cpu;
 
 	if (cpu < nr_cpu_ids)
@@ -837,9 +915,9 @@ void mbm_handle_overflow(struct work_struct *work)
 {
 	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
 	struct rdtgroup *prgrp, *crgrp;
+	struct rdt_mon_domain *d;
 	struct list_head *head;
 	struct rdt_resource *r;
-	struct rdt_domain *d;
 
 	cpus_read_lock();
 	mutex_lock(&rdtgroup_mutex);
@@ -852,7 +930,7 @@
 		goto out_unlock;
 
 	r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-	d = container_of(work, struct rdt_domain, mbm_over.work);
+	d = container_of(work, struct rdt_mon_domain, mbm_over.work);
 
 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
 		mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
@@ -869,7 +947,7 @@
 	 * Re-check for housekeeping CPUs. This allows the overflow handler to
 	 * move off a nohz_full CPU quickly.
 	 */
-	d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
+	d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
 						   RESCTRL_PICK_ANY_CPU);
 	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
 
@@ -886,7 +964,7 @@ out_unlock:
 * @exclude_cpu:   Which CPU the handler should not run on,
 *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
-void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
 				int exclude_cpu)
 {
 	unsigned long delay = msecs_to_jiffies(delay_ms);
@@ -898,7 +976,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
 	 */
 	if (!resctrl_mounted || !resctrl_arch_mon_capable())
 		return;
-	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
+	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
 	dom->mbm_work_cpu = cpu;
 
 	if (cpu < nr_cpu_ids)
@@ -1015,6 +1093,88 @@ static void l3_mon_evt_init(struct rdt_resource *r)
 		list_add_tail(&mbm_local_event.list, &r->evt_list);
 }
 
+/*
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
+ */
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+	if (snc_nodes_per_l3_cache > 1)
+		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
+}
+
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+	{}
+};
+
+/*
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
+ */
+static __init int snc_get_config(void)
+{
+	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+	const cpumask_t *node0_cpumask;
+	int cpus_per_node, cpus_per_l3;
+	int ret;
+
+	if (!x86_match_cpu(snc_cpu_ids) || !ci)
+		return 1;
+
+	cpus_read_lock();
+	if (num_online_cpus() != num_present_cpus())
+		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+	cpus_read_unlock();
+
+	node0_cpumask = cpumask_of_node(cpu_to_node(0));
+
+	cpus_per_node = cpumask_weight(node0_cpumask);
+	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
+
+	if (!cpus_per_node || !cpus_per_l3)
+		return 1;
+
+	ret = cpus_per_l3 / cpus_per_node;
+
+	/* sanity check: Only valid results are 1, 2, 3, 4 */
+	switch (ret) {
+	case 1:
+		break;
+	case 2 ... 4:
+		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+		break;
+	default:
+		pr_warn("Ignore improbable SNC node count %d\n", ret);
+		ret = 1;
+		break;
+	}
+
+	return ret;
+}
+
 int __init rdt_get_mon_l3_config(struct rdt_resource *r)
 {
@@ -1022,9 +1182,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
 	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	unsigned int threshold;
 	int ret;
 
+	snc_nodes_per_l3_cache = snc_get_config();
+
 	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
-	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
-	r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+	r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
 	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
 
 	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
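
The following standalone sketch (not part of the patch above, and not kernel code) walks through the same arithmetic that logical_rmid_to_physical_rmid() and snc_get_config() perform when SNC is enabled. The topology values used here (56 CPUs sharing an L3 cache, 28 CPUs per SNC node, 256 physical RMIDs per L3) are hypothetical and chosen only for illustration.

/*
 * Standalone user-space sketch of the SNC RMID arithmetic.
 * All topology numbers below are made-up example inputs.
 */
#include <stdio.h>

#define CPUS_PER_L3	56	/* CPUs sharing one L3 cache (example)   */
#define CPUS_PER_NODE	28	/* CPUs in one SNC (NUMA) node (example) */
#define PHYS_RMIDS	256	/* physical RMIDs per L3 cache (example) */

/* Mirrors snc_get_config(): nodes per L3 = CPUs per L3 / CPUs per node. */
static int snc_nodes_per_l3(void)
{
	return CPUS_PER_L3 / CPUS_PER_NODE;	/* 2 in this example */
}

/* Stand-in for cpu_to_node(): CPUs 0-27 on node 0, CPUs 28-55 on node 1. */
static int cpu_to_node(int cpu)
{
	return cpu / CPUS_PER_NODE;
}

/*
 * Mirrors logical_rmid_to_physical_rmid(): in RMID sharing mode the
 * physical RMID space is split evenly between the SNC nodes sharing an
 * L3, so each node sees PHYS_RMIDS / nodes logical RMIDs starting at 0.
 */
static int logical_to_physical_rmid(int cpu, int lrmid)
{
	int nodes = snc_nodes_per_l3();
	int num_rmid = PHYS_RMIDS / nodes;

	if (nodes == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % nodes) * num_rmid;
}

int main(void)
{
	/* Logical RMID 5 maps to 5 on SNC node 0 but to 133 on node 1. */
	printf("CPU 0  (node 0): physical RMID %d\n", logical_to_physical_rmid(0, 5));
	printf("CPU 40 (node 1): physical RMID %d\n", logical_to_physical_rmid(40, 5));
	return 0;
}

With two SNC nodes per L3 cache, the 256 physical RMIDs split into two banks of 128 logical RMIDs, one per node; this is the same reason rdt_get_mon_l3_config() in the patch divides r->num_rmid and hw_res->mon_scale by snc_nodes_per_l3_cache.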