Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r--	kernel/cgroup/cpuset.c	158
1 file changed, 60 insertions, 98 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4237c8748715..c12b9fdb22a4 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -202,6 +202,14 @@ struct cpuset {
 };
 
 /*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
+ */
+struct cpuset_remove_tasks_struct {
+	struct work_struct work;
+	struct cpuset *cs;
+};
+
+/*
  * Exclusive CPUs distributed out to sub-partitions of top_cpuset
  */
 static cpumask_var_t	subpartitions_cpus;
@@ -360,9 +368,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
 }
 
 static struct cpuset top_cpuset = {
-	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
-		  (1 << CS_MEM_EXCLUSIVE)),
+	.flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
 	.partition_root_state = PRS_ROOT,
+	.relax_domain_level = -1,
 	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
 };
 
@@ -449,12 +458,6 @@ static DEFINE_SPINLOCK(callback_lock);
 
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 
 static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -540,22 +543,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
 
 	rcu_read_lock();
 	cs = task_cs(tsk);
 
-	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
+	while (!cpumask_intersects(cs->effective_cpus, pmask))
 		cs = parent_cs(cs);
-		if (unlikely(!cs)) {
-			/*
-			 * The top cpuset doesn't have any online cpu as a
-			 * consequence of a race between cpuset_hotplug_work
-			 * and cpu hotplug notifier.  But we know the top
-			 * cpuset's effective_cpus is on its way to be
-			 * identical to cpu_online_mask.
-			 */
-			goto out_unlock;
-		}
-	}
-	cpumask_and(pmask, pmask, cs->effective_cpus);
-out_unlock:
+	cpumask_and(pmask, pmask, cs->effective_cpus);
 	rcu_read_unlock();
 }
 
@@ -1217,7 +1208,7 @@ static void rebuild_sched_domains_locked(void)
 	/*
 	 * If we have raced with CPU hotplug, return early to avoid
 	 * passing doms with offlined cpu to partition_sched_domains().
-	 * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+	 * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
 	 *
 	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
 	 * should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1251,17 @@ static void rebuild_sched_domains_locked(void)
 }
 #endif /* CONFIG_SMP */
 
-void rebuild_sched_domains(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
-	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
 	mutex_unlock(&cpuset_mutex);
+}
+
+void rebuild_sched_domains(void)
+{
+	cpus_read_lock();
+	rebuild_sched_domains_cpuslocked();
 	cpus_read_unlock();
 }
 
@@ -2079,14 +2075,11 @@ write_error:
 
 	/*
	 * For partcmd_update without newmask, it is being called from
-	 * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
-	 * Update the load balance flag and scheduling domain if
-	 * cpus_read_trylock() is successful.
+	 * cpuset_handle_hotplug(). Update the load balance flag and
+	 * scheduling domain accordingly.
 	 */
-	if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+	if ((cmd == partcmd_update) && !newmask)
 		update_partition_sd_lb(cs, old_prs);
-		cpus_read_unlock();
-	}
 
 	notify_partition_change(cs, old_prs);
 	return 0;
@@ -2948,7 +2941,7 @@ bool current_cpuset_is_being_rebound(void)
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
-	if (val < -1 || val >= sched_domain_level_max)
+	if (val < -1 || val > sched_domain_level_max + 1)
 		return -EINVAL;
 #endif
@@ -3599,8 +3592,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	 * proceeding, so that we don't end up keep removing tasks added
 	 * after execution capability is restored.
 	 *
-	 * cpuset_hotplug_work calls back into cgroup core via
-	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+	 * cpuset_handle_hotplug may call back into cgroup core asynchronously
+	 * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
 	 * operation like this one can lead to a deadlock through kernfs
 	 * active_ref protection.  Let's break the protection.  Losing the
 	 * protection is okay as we check whether @cs is online after
@@ -3609,7 +3602,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	 */
 	css_get(&cs->css);
 	kernfs_break_active_protection(of->kn);
-	flush_work(&cpuset_hotplug_work);
 
 	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
@@ -3782,9 +3774,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
 
 	buf = strstrip(buf);
 
-	/*
-	 * Convert "root" to ENABLED, and convert "member" to DISABLED.
-	 */
 	if (!strcmp(buf, "root"))
 		val = PRS_ROOT;
 	else if (!strcmp(buf, "member"))
@@ -4060,11 +4049,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		cs->effective_mems = parent->effective_mems;
 		cs->use_parent_ecpus = true;
 		parent->child_ecpus_count++;
-		/*
-		 * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
-		 */
-		if (!is_sched_load_balance(parent))
-			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	}
 
 	/*
@@ -4318,8 +4302,6 @@ int __init cpuset_init(void)
 	nodes_setall(top_cpuset.effective_mems);
 
 	fmeter_init(&top_cpuset.fmeter);
-	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
-	top_cpuset.relax_domain_level = -1;
 	INIT_LIST_HEAD(&remote_children);
 
 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4354,6 +4336,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 	}
 }
 
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+	struct cpuset_remove_tasks_struct *s;
+
+	s = container_of(work, struct cpuset_remove_tasks_struct, work);
+	remove_tasks_in_empty_cpuset(s->cs);
+	css_put(&s->cs->css);
+	kfree(s);
+}
+
 static void
 hotplug_update_tasks_legacy(struct cpuset *cs,
 			    struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4383,12 +4375,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 
 	/*
 	 * Move tasks to the nearest ancestor with execution resources,
 	 * This is full cgroup operation which will also call back into
-	 * cpuset. Should be done outside any lock.
+	 * cpuset. Execute it asynchronously using workqueue.
 	 */
-	if (is_empty) {
-		mutex_unlock(&cpuset_mutex);
-		remove_tasks_in_empty_cpuset(cs);
-		mutex_lock(&cpuset_mutex);
+	if (is_empty && cs->css.cgroup->nr_populated_csets &&
+	    css_tryget_online(&cs->css)) {
+		struct cpuset_remove_tasks_struct *s;
+
+		s = kzalloc(sizeof(*s), GFP_KERNEL);
+		if (WARN_ON_ONCE(!s)) {
+			css_put(&cs->css);
+			return;
+		}
+
+		s->cs = cs;
+		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+		schedule_work(&s->work);
 	}
 }
 
@@ -4421,30 +4422,6 @@ void cpuset_force_rebuild(void)
 	force_rebuild = true;
 }
 
-/*
- * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
- * progress.
- * Return: true if successful, false otherwise
- *
- * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
-static bool cpuset_hotplug_cpus_read_trylock(void)
-{
-	int retries = 0;
-
-	while (!cpus_read_trylock()) {
-		/*
-		 * CPU hotplug still in progress. Retry 5 times
-		 * with a 10ms wait before bailing out.
-		 */
-		if (++retries > 5)
-			return false;
-		msleep(10);
-	}
-	return true;
-}
-
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
@@ -4493,13 +4470,11 @@ retry:
 		compute_partition_effective_cpumask(cs, &new_cpus);
 
 	if (remote && cpumask_empty(&new_cpus) &&
-	    partition_is_populated(cs, NULL) &&
-	    cpuset_hotplug_cpus_read_trylock()) {
+	    partition_is_populated(cs, NULL)) {
 		remote_partition_disable(cs, tmp);
 		compute_effective_cpumask(&new_cpus, cs, parent);
 		remote = false;
 		cpuset_force_rebuild();
-		cpus_read_unlock();
 	}
 
 	/*
@@ -4519,18 +4494,8 @@ retry:
 	else if (is_partition_valid(parent) && is_partition_invalid(cs))
 		partcmd = partcmd_update;
 
-	/*
-	 * cpus_read_lock needs to be held before calling
-	 * update_parent_effective_cpumask(). To avoid circular lock
-	 * dependency between cpuset_mutex and cpus_read_lock,
-	 * cpus_read_trylock() is used here to acquire the lock.
-	 */
 	if (partcmd >= 0) {
-		if (!cpuset_hotplug_cpus_read_trylock())
-			goto update_tasks;
-
 		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
-		cpus_read_unlock();
 		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
 			compute_partition_effective_cpumask(cs, &new_cpus);
 			cpuset_force_rebuild();
@@ -4558,8 +4523,7 @@ unlock:
 }
 
 /**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
- * @work: unused
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
  *
  * This function is called after either CPU or memory configuration has
  * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -4573,8 +4537,10 @@ unlock:
  *
  * Note that CPU offlining during suspend is ignored.  We don't modify
  * cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
  */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_handle_hotplug(void)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -4585,6 +4551,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
 		ptmp = &tmp;
 
+	lockdep_assert_cpus_held();
 	mutex_lock(&cpuset_mutex);
 
 	/* fetch the available cpus/mems and find out which changed how */
@@ -4666,7 +4633,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	/* rebuild sched domains if cpus_allowed has changed */
 	if (cpus_updated || force_rebuild) {
 		force_rebuild = false;
-		rebuild_sched_domains();
+		rebuild_sched_domains_cpuslocked();
 	}
 
 	free_cpumasks(NULL, ptmp);
@@ -4679,12 +4646,7 @@ void cpuset_update_active_cpus(void)
 	 * inside cgroup synchronization.  Bounce actual hotplug processing
 	 * to a work item to avoid reverse locking order.
 	 */
-	schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
-	flush_work(&cpuset_hotplug_work);
+	cpuset_handle_hotplug();
 }
 
 /*
@@ -4695,7 +4657,7 @@ void cpuset_wait_for_hotplug(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	schedule_work(&cpuset_hotplug_work);
+	cpuset_handle_hotplug();
 	return NOTIFY_OK;
 }
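The one structurally new piece in this diff is the deferred-work pattern used for the legacy hierarchy: cpuset_remove_tasks_struct bundles a work_struct with the cpuset it refers to, the work function recovers the payload with container_of(), and the work item frees its own allocation once cgroup_transfer_tasks() has run. Below is a minimal, self-contained sketch of that pattern as a toy kernel module. Everything named demo_* is hypothetical and exists only for illustration; the patch itself queues onto the system workqueue with schedule_work(), whereas this sketch uses a private workqueue so module teardown can drain it safely.

// SPDX-License-Identifier: GPL-2.0
/*
 * Illustrative sketch only: wrap a payload and a work_struct in one
 * allocation, recover the payload with container_of() in the work
 * function, and free the allocation there.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

struct demo_deferred {
	struct work_struct work;
	int payload;			/* stands in for the struct cpuset pointer */
};

static void demo_workfn(struct work_struct *work)
{
	struct demo_deferred *d = container_of(work, struct demo_deferred, work);

	pr_info("deferred payload = %d\n", d->payload);
	kfree(d);			/* the work item owns its own allocation */
}

static int __init demo_init(void)
{
	struct demo_deferred *d;

	demo_wq = alloc_workqueue("demo_wq", 0, 0);
	if (!demo_wq)
		return -ENOMEM;

	d = kzalloc(sizeof(*d), GFP_KERNEL);
	if (!d) {
		destroy_workqueue(demo_wq);
		return -ENOMEM;
	}

	d->payload = 42;
	INIT_WORK(&d->work, demo_workfn);
	queue_work(demo_wq, &d->work);	/* runs asynchronously, as in the patch */
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(demo_wq);	/* drains any pending work before freeing */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The design point mirrored from the patch is ownership: the submitter allocates the container, hands it to the workqueue, and only the work function frees it, so the submitting side needs no flush. In the real code the css_tryget_online()/css_put() pair additionally pins the cpuset until the deferred migration has finished.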