Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r--	kernel/cgroup/cpuset.c	197
1 file changed, 142 insertions, 55 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c12b9fdb22a4..40ec4abaf440 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,6 +21,7 @@
  *  License.  See the file COPYING in the main directory of the Linux
  *  distribution for more details.
  */
+#include "cgroup-internal.h"
 
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
@@ -87,7 +88,7 @@ static const char * const perr_strings[] = {
 	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
 	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
 	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
-	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+	[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
 	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
 };
 
@@ -127,19 +128,28 @@ struct cpuset {
 	/*
 	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
 	 *
-	 * This exclusive CPUs must be a subset of cpus_allowed. A parent
-	 * cgroup can only grant exclusive CPUs to one of its children.
+	 * The effective_cpus of a valid partition root comes solely from its
+	 * effective_xcpus and some of the effective_xcpus may be distributed
+	 * to sub-partitions below & hence excluded from its effective_cpus.
+	 * For a valid partition root, its effective_cpus have no relationship
+	 * with cpus_allowed unless its exclusive_cpus isn't set.
 	 *
-	 * When the cgroup becomes a valid partition root, effective_xcpus
-	 * defaults to cpus_allowed if not set. The effective_cpus of a valid
-	 * partition root comes solely from its effective_xcpus and some of the
-	 * effective_xcpus may be distributed to sub-partitions below & hence
-	 * excluded from its effective_cpus.
+	 * This value will only be set if either exclusive_cpus is set or
+	 * when this cpuset becomes a local partition root.
 	 */
 	cpumask_var_t effective_xcpus;
 
 	/*
 	 * Exclusive CPUs as requested by the user (default hierarchy only)
+	 *
+	 * Its value is independent of cpus_allowed and designates the set of
+	 * CPUs that can be granted to the current cpuset or its children when
+	 * it becomes a valid partition root. The effective set of exclusive
+	 * CPUs granted (effective_xcpus) depends on whether those exclusive
+	 * CPUs are passed down by its ancestors and not yet taken up by
+	 * another sibling partition root along the way.
+	 *
+	 * If its value isn't set, it defaults to cpus_allowed.
 	 */
 	cpumask_var_t exclusive_cpus;
 
@@ -169,7 +179,7 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* number of valid sub-partitions */
+	/* number of valid local child partitions */
 	int nr_subparts;
 
 	/* partition root state */
@@ -230,6 +240,17 @@ static struct list_head remote_children;
  *   2 - partition root without load balancing (isolated)
  *  -1 - invalid partition root
  *  -2 - invalid isolated partition root
+ *
+ *  There are 2 types of partitions - local or remote. Local partitions are
+ *  those whose parents are partition root themselves. Setting of
+ *  cpuset.cpus.exclusive are optional in setting up local partitions.
+ *  Remote partitions are those whose parents are not partition roots. Passing
+ *  down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
+ *  nodes are mandatory in creating a remote partition.
+ *
+ *  For simplicity, a local partition can be created under a local or remote
+ *  partition but a remote partition cannot have any partition root in its
+ *  ancestor chain except the cgroup root.
  */
 #define PRS_MEMBER		0
 #define PRS_ROOT		1
@@ -434,7 +455,7 @@ static struct cpuset top_cpuset = {
  * by other task, we use alloc_lock in the task_struct fields to protect
  * them.
  *
- * The cpuset_common_file_read() handlers only hold callback_lock across
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
  *
@@ -709,6 +730,19 @@ static inline void free_cpuset(struct cpuset *cs)
 	kfree(cs);
 }
 
+/* Return user specified exclusive CPUs */
+static inline struct cpumask *user_xcpus(struct cpuset *cs)
+{
+	return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
+						 : cs->exclusive_cpus;
+}
+
+static inline bool xcpus_empty(struct cpuset *cs)
+{
+	return cpumask_empty(cs->cpus_allowed) &&
+	       cpumask_empty(cs->exclusive_cpus);
+}
+
 static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
 {
 	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
@@ -825,17 +859,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 
 	/*
 	 * If either I or some sibling (!= me) is exclusive, we can't
-	 * overlap
+	 * overlap. exclusive_cpus cannot overlap with each other if set.
 	 */
 	ret = -EINVAL;
 	cpuset_for_each_child(c, css, par) {
-		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
-		    c != cur) {
+		bool txset, cxset;	/* Are exclusive_cpus set? */
+
+		if (c == cur)
+			continue;
+
+		txset = !cpumask_empty(trial->exclusive_cpus);
+		cxset = !cpumask_empty(c->exclusive_cpus);
+		if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
+		    (txset && cxset)) {
 			if (!cpusets_are_exclusive(trial, c))
 				goto out;
+		} else if (txset || cxset) {
+			struct cpumask *xcpus, *acpus;
+
+			/*
+			 * When just one of the exclusive_cpus's is set,
+			 * cpus_allowed of the other cpuset, if set, cannot be
+			 * a subset of it or none of those CPUs will be
+			 * available if these exclusive CPUs are activated.
+			 */
+			if (txset) {
+				xcpus = trial->exclusive_cpus;
+				acpus = c->cpus_allowed;
+			} else {
+				xcpus = c->exclusive_cpus;
+				acpus = trial->cpus_allowed;
+			}
+			if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
+				goto out;
 		}
 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
-		    c != cur &&
 		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
 			goto out;
 	}
@@ -957,13 +1015,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int nslot;		/* next empty doms[] struct cpumask slot */
 	struct cgroup_subsys_state *pos_css;
 	bool root_load_balance = is_sched_load_balance(&top_cpuset);
+	bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
 
 	doms = NULL;
 	dattr = NULL;
 	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && !top_cpuset.nr_subparts) {
+	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
 		ndoms = 1;
 		doms = alloc_sched_domains(ndoms);
 		if (!doms)
@@ -991,16 +1051,18 @@
 	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 		if (cp == &top_cpuset)
 			continue;
+
+		if (cgrpv2)
+			goto v2;
+
 		/*
+		 * v1:
 		 * Continue traversing beyond @cp iff @cp has some CPUs and
 		 * isn't load balancing.  The former is obvious.  The
 		 * latter: All child cpusets contain a subset of the
 		 * parent's cpus, so just skip them, and then we call
 		 * update_domain_attr_tree() to calc relax_domain_level of
 		 * the corresponding sched domain.
-		 *
-		 * If root is load-balancing, we can skip @cp if it
-		 * is a subset of the root's effective_cpus.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
@@ -1008,20 +1070,39 @@
 					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
 			continue;
 
-		if (root_load_balance &&
-		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
-			continue;
-
 		if (is_sched_load_balance(cp) &&
 		    !cpumask_empty(cp->effective_cpus))
 			csa[csn++] = cp;
 
-		/* skip @cp's subtree if not a partition root */
-		if (!is_partition_valid(cp))
+		/* skip @cp's subtree */
+		pos_css = css_rightmost_descendant(pos_css);
+		continue;
+
+v2:
+		/*
+		 * Only valid partition roots that are not isolated and with
+		 * non-empty effective_cpus will be saved into csn[].
+		 */
+		if ((cp->partition_root_state == PRS_ROOT) &&
+		    !cpumask_empty(cp->effective_cpus))
+			csa[csn++] = cp;
+
+		/*
+		 * Skip @cp's subtree if not a partition root and has no
+		 * exclusive CPUs to be granted to child cpusets.
+		 */
+		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
 			pos_css = css_rightmost_descendant(pos_css);
 	}
 	rcu_read_unlock();
 
+	/*
+	 * If there are only isolated partitions underneath the cgroup root,
+	 * we can optimize out unneeded sched domains scanning.
+	 */
+	if (root_load_balance && (csn == 1))
+		goto single_root_domain;
+
 	for (i = 0; i < csn; i++)
 		csa[i]->pn = i;
 	ndoms = csn;
@@ -1064,6 +1145,20 @@ restart:
 	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
 			      GFP_KERNEL);
 
+	/*
+	 * Cgroup v2 doesn't support domain attributes, just set all of them
+	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+	 */
+	if (cgrpv2) {
+		for (i = 0; i < ndoms; i++) {
+			cpumask_copy(doms[i], csa[i]->effective_cpus);
+			if (dattr)
+				dattr[i] = SD_ATTR_INIT;
+		}
+		goto done;
+	}
+
 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
 		struct cpumask *dp;
@@ -1223,7 +1318,7 @@ static void rebuild_sched_domains_locked(void)
 	 * root should be only a subset of the active CPUs.  Since a CPU in any
 	 * partition root could be offlined, all must be checked.
 	 */
-	if (top_cpuset.nr_subparts) {
+	if (!cpumask_empty(subpartitions_cpus)) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 			if (!is_partition_valid(cs)) {
@@ -1338,7 +1433,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
  */
 static int update_partition_exclusive(struct cpuset *cs, int new_prs)
 {
-	bool exclusive = (new_prs > 0);
+	bool exclusive = (new_prs > PRS_MEMBER);
 
 	if (exclusive && !is_cpu_exclusive(cs)) {
 		if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
@@ -1532,7 +1627,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
  * Return: true if xcpus is not empty, false otherwise.
  *
  * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ * it must be a subset of parent's effective_xcpus.
  */
 static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
 						struct cpumask *xcpus)
@@ -1542,12 +1637,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
 	if (!xcpus)
 		xcpus = cs->effective_xcpus;
 
-	if (!cpumask_empty(cs->exclusive_cpus))
-		cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
-	else
-		cpumask_copy(xcpus, cs->cpus_allowed);
-
-	return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+	return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
 }
 
 static inline bool is_remote_partition(struct cpuset *cs)
@@ -1826,8 +1916,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 	 */
 	adding = deleting = false;
 	old_prs = new_prs = cs->partition_root_state;
-	xcpus = !cpumask_empty(cs->exclusive_cpus)
-		? cs->effective_xcpus : cs->cpus_allowed;
+	xcpus = user_xcpus(cs);
 
 	if (cmd == partcmd_invalidate) {
 		if (is_prs_invalid(old_prs))
@@ -1855,7 +1944,7 @@
 		return is_partition_invalid(parent)
 		       ? PERR_INVPARENT : PERR_NOTPART;
 	}
-	if (!newmask && cpumask_empty(cs->cpus_allowed))
+	if (!newmask && xcpus_empty(cs))
 		return PERR_CPUSEMPTY;
 
 	nocpu = tasks_nocpu_error(parent, cs, xcpus);
@@ -2583,8 +2672,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
 		if (retval < 0)
 			return retval;
-		if (!is_cpu_exclusive(cs))
-			set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
 	}
 
 	/* Nothing to do if the CPUs didn't change */
@@ -3071,9 +3158,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 				       ? partcmd_enable : partcmd_enablei;
 
 		/*
-		 * cpus_allowed cannot be empty.
+		 * cpus_allowed and exclusive_cpus cannot be both empty.
 		 */
-		if (cpumask_empty(cs->cpus_allowed)) {
+		if (xcpus_empty(cs)) {
 			err = PERR_CPUSEMPTY;
 			goto out;
 		}
@@ -4009,8 +4096,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 	}
 
 	__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-	nodes_clear(cs->mems_allowed);
-	nodes_clear(cs->effective_mems);
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 	INIT_LIST_HEAD(&cs->remote_sibling);
@@ -4040,6 +4125,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
 	if (is_spread_slab(parent))
 		set_bit(CS_SPREAD_SLAB, &cs->flags);
+	/*
+	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+	 */
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+	    !is_sched_load_balance(parent))
+		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 
 	cpuset_inc();
 
@@ -4050,14 +4141,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		cs->use_parent_ecpus = true;
 		parent->child_ecpus_count++;
 	}
-
-	/*
-	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
-	 */
-	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-	    !is_sched_load_balance(parent))
-		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-
 	spin_unlock_irq(&callback_lock);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -4571,7 +4654,7 @@ static void cpuset_handle_hotplug(void)
 	 * In the rare case that hotplug removes all the cpus in
 	 * subpartitions_cpus, we assumed that cpus are updated.
 	 */
-	if (!cpus_updated && top_cpuset.nr_subparts)
+	if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
 		cpus_updated = true;
 
 	/* For v1, synchronize cpus_allowed to cpu_active_mask */
@@ -5051,10 +5134,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
 	if (!buf)
 		goto out;
 
-	css = task_get_css(tsk, cpuset_cgrp_id);
-	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
-				current->nsproxy->cgroup_ns);
-	css_put(css);
+	rcu_read_lock();
+	spin_lock_irq(&css_set_lock);
+	css = task_css(tsk, cpuset_cgrp_id);
+	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+				       current->nsproxy->cgroup_ns);
+	spin_unlock_irq(&css_set_lock);
+	rcu_read_unlock();
+
 	if (retval == -E2BIG)
 		retval = -ENAMETOOLONG;
 	if (retval < 0)
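The comment block added to the partition root state definitions above distinguishes local partitions (the parent is itself a partition root, so cpuset.cpus.exclusive is optional) from remote partitions (the parent is not a partition root, so cpuset.cpus.exclusive must be set along the ancestor chain). As a rough illustration only, and not part of the patch, the sketch below shows the cgroup v2 file writes those semantics imply when turning an existing cgroup into a remote partition; the /sys/fs/cgroup/a/b hierarchy, the CPU list "2-3", and the write_file() helper are assumptions, and the cgroups must already exist with the cpuset controller enabled.

/*
 * Illustrative sketch only (not part of the patch): create a remote
 * partition at /sys/fs/cgroup/a/b, where "a" is not a partition root.
 * Paths and the CPU list are assumptions; run as root with cgroup v2
 * mounted and the cpuset controller enabled for "a" and "b".
 */
#include <stdio.h>
#include <stdlib.h>

static void write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	if (fputs(val, f) == EOF || fclose(f) == EOF) {
		perror(path);
		exit(1);
	}
}

int main(void)
{
	/* Pass the exclusive CPUs down the ancestor chain... */
	write_file("/sys/fs/cgroup/a/cpuset.cpus.exclusive", "2-3");
	/* ...request them in the target cgroup... */
	write_file("/sys/fs/cgroup/a/b/cpuset.cpus.exclusive", "2-3");
	/* ...then make the target a (remote) partition root. */
	write_file("/sys/fs/cgroup/a/b/cpuset.cpus.partition", "root");
	return 0;
}

A local partition, by contrast, only needs its CPUs set in a child of an existing partition root before "root" is written to cpuset.cpus.partition, which is why the new comment describes cpuset.cpus.exclusive as optional in that case.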