Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--	kernel/cgroup.c	239
1 file changed, 90 insertions, 149 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7dc8788cfd52..136eceadeed1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
 
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
-static void cgroup_put(struct cgroup *cgrp);
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned int ss_mask);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
 static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /* IDR wrappers which synchronize using cgroup_idr_lock */
 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
@@ -331,14 +329,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 	return false;
 }
 
-static int cgroup_is_releasable(const struct cgroup *cgrp)
-{
-	const int bits =
-		(1 << CGRP_RELEASABLE) |
-		(1 << CGRP_NOTIFY_ON_RELEASE);
-	return (cgrp->flags & bits) == bits;
-}
-
 static int notify_on_release(const struct cgroup *cgrp)
 {
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -394,12 +384,7 @@ static int notify_on_release(const struct cgroup *cgrp)
 			;						\
 		else
 
-/* the list of cgroups eligible for automatic release. Protected by
- * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
 static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
 /*
@@ -498,7 +483,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }
 
-static void put_css_set_locked(struct css_set *cset, bool taskexit)
+static void put_css_set_locked(struct css_set *cset)
 {
 	struct cgrp_cset_link *link, *tmp_link;
 	struct cgroup_subsys *ss;
@@ -524,11 +509,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
 		/* @cgrp can't go away while we're holding css_set_rwsem */
 		if (list_empty(&cgrp->cset_links)) {
 			cgroup_update_populated(cgrp, false);
-			if (notify_on_release(cgrp)) {
-				if (taskexit)
-					set_bit(CGRP_RELEASABLE, &cgrp->flags);
-				check_for_release(cgrp);
-			}
+			check_for_release(cgrp);
 		}
 
 		kfree(link);
@@ -537,7 +518,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
 	kfree_rcu(cset, rcu_head);
 }
 
-static void put_css_set(struct css_set *cset, bool taskexit)
+static void put_css_set(struct css_set *cset)
 {
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
@@ -548,7 +529,7 @@ static void put_css_set(struct css_set *cset, bool taskexit)
 		return;
 
 	down_write(&css_set_rwsem);
-	put_css_set_locked(cset, taskexit);
+	put_css_set_locked(cset);
 	up_write(&css_set_rwsem);
 }
 
@@ -969,14 +950,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * knows that the cgroup won't be removed, as cgroup_rmdir()
  * needs that mutex.
  *
- * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
- * (usually) take cgroup_mutex.  These are the two most performance
- * critical pieces of code here.  The exception occurs on cgroup_exit(),
- * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
- * is taken, and if the cgroup count is zero, a usermode call made
- * to the release agent with the name of the cgroup (path relative to
- * the root of cgroup file system) as the argument.
- *
  * A cgroup can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cgroups is empty.  Since all
  * tasks in the system use _some_ cgroup, and since there is always at
@@ -1035,6 +1008,11 @@ static void cgroup_get(struct cgroup *cgrp)
 	css_get(&cgrp->self);
 }
 
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+	return css_tryget(&cgrp->self);
+}
+
 static void cgroup_put(struct cgroup *cgrp)
 {
 	css_put(&cgrp->self);
@@ -1147,7 +1125,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
 	 * protection against removal.  Ensure @cgrp stays accessible and
 	 * break the active_ref protection.
 	 */
-	cgroup_get(cgrp);
+	if (!cgroup_tryget(cgrp))
+		return NULL;
 	kernfs_break_active_protection(kn);
 
 	mutex_lock(&cgroup_mutex);
@@ -1581,7 +1560,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->self.sibling);
 	INIT_LIST_HEAD(&cgrp->self.children);
 	INIT_LIST_HEAD(&cgrp->cset_links);
-	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
 	cgrp->self.cgroup = cgrp;
@@ -1591,6 +1569,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
 	init_waitqueue_head(&cgrp->offline_waitq);
+	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
 }
 
 static void init_cgroup_root(struct cgroup_root *root,
@@ -1628,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
 		goto out;
 	root_cgrp->id = ret;
 
-	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+			      GFP_KERNEL);
 	if (ret)
 		goto out;
 
@@ -2046,8 +2026,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	 * task. As trading it for new_cset is protected by cgroup_mutex,
 	 * we're safe to drop it here; it will be freed under RCU.
 	 */
-	set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
-	put_css_set_locked(old_cset, false);
+	put_css_set_locked(old_cset);
 }
 
 /**
@@ -2068,7 +2047,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cset = NULL;
 		list_del_init(&cset->mg_preload_node);
-		put_css_set_locked(cset, false);
+		put_css_set_locked(cset);
 	}
 	up_write(&css_set_rwsem);
 }
@@ -2162,8 +2141,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
 		if (src_cset == dst_cset) {
 			src_cset->mg_src_cgrp = NULL;
 			list_del_init(&src_cset->mg_preload_node);
-			put_css_set(src_cset, false);
-			put_css_set(dst_cset, false);
+			put_css_set(src_cset);
+			put_css_set(dst_cset);
 			continue;
 		}
 
@@ -2172,7 +2151,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
 		if (list_empty(&dst_cset->mg_preload_node))
 			list_add(&dst_cset->mg_preload_node, &csets);
 		else
-			put_css_set(dst_cset, false);
+			put_css_set(dst_cset);
 	}
 
 	list_splice_tail(&csets, preloaded_csets);
@@ -3271,8 +3250,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype *cft;
 
-	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
-		cft->flags |= __CFTYPE_NOT_ON_DFL;
+	/*
+	 * If legacy_flies_on_dfl, we want to show the legacy files on the
+	 * dfl hierarchy but iff the target subsystem hasn't been updated
+	 * for the dfl hierarchy yet.
+	 */
+	if (!cgroup_legacy_files_on_dfl ||
+	    ss->dfl_cftypes != ss->legacy_cftypes) {
+		for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+			cft->flags |= __CFTYPE_NOT_ON_DFL;
+	}
+
 	return cgroup_add_cftypes(ss, cfts);
 }
 
@@ -3970,7 +3958,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 
 	l = cgroup_pidlist_find_create(cgrp, type);
 	if (!l) {
-		mutex_unlock(&cgrp->pidlist_mutex);
 		pidlist_free(array);
 		return -ENOMEM;
 	}
@@ -4159,7 +4146,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
 					  struct cftype *cft, u64 val)
 {
-	clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
 	if (val)
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
 	else
@@ -4337,6 +4323,7 @@ static void css_free_work_fn(struct work_struct *work)
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
 		cgroup_pidlist_destroy_all(cgrp);
+		cancel_work_sync(&cgrp->release_agent_work);
 
 		if (cgroup_parent(cgrp)) {
 			/*
@@ -4387,6 +4374,15 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
 		cgrp->id = -1;
+
+		/*
+		 * There are two control paths which try to determine
+		 * cgroup from dentry without going through kernfs -
+		 * cgroupstats_build() and css_tryget_online_from_dir().
+		 * Those are supported by RCU protecting clearing of
+		 * cgrp->kn->priv backpointer.
+		 */
+		RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -4487,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 
 	init_and_link_css(css, ss, cgrp);
 
-	err = percpu_ref_init(&css->refcnt, css_release);
+	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
 	if (err)
 		goto err_free_css;
 
@@ -4543,6 +4539,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	struct cftype *base_files;
 	int ssid, ret;
 
+	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
+	 */
+	if (strchr(name, '\n'))
+		return -EINVAL;
+
 	parent = cgroup_kn_lock_live(parent_kn);
 	if (!parent)
 		return -ENODEV;
@@ -4555,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 		goto out_unlock;
 	}
 
-	ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
 	if (ret)
 		goto out_free_cgrp;
 
@@ -4785,19 +4786,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	for_each_css(css, ssid, cgrp)
 		kill_css(css);
 
-	/* CSS_ONLINE is clear, remove from ->release_list for the last time */
-	raw_spin_lock(&release_list_lock);
-	if (!list_empty(&cgrp->release_list))
-		list_del_init(&cgrp->release_list);
-	raw_spin_unlock(&release_list_lock);
-
 	/*
 	 * Remove @cgrp directory along with the base files.  @cgrp has an
 	 * extra ref on its kn.
 	 */
 	kernfs_remove(cgrp->kn);
 
-	set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
 	check_for_release(cgroup_parent(cgrp));
 
 	/* put the base reference */
@@ -4814,23 +4808,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 	cgrp = cgroup_kn_lock_live(kn);
 	if (!cgrp)
 		return 0;
-	cgroup_get(cgrp);	/* for @kn->priv clearing */
 
 	ret = cgroup_destroy_locked(cgrp);
 
 	cgroup_kn_unlock(kn);
-
-	/*
-	 * There are two control paths which try to determine cgroup from
-	 * dentry without going through kernfs - cgroupstats_build() and
-	 * css_tryget_online_from_dir().  Those are supported by RCU
-	 * protecting clearing of cgrp->kn->priv backpointer, which should
-	 * happen after all files under it have been removed.
-	 */
-	if (!ret)
-		RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
-	cgroup_put(cgrp);
 	return ret;
 }
 
@@ -5034,12 +5015,9 @@ core_initcall(cgroup_wq_init);
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
  *  - Used for /proc/<pid>/cgroup.
  */
-
-/* TODO: Use a proper seq_file iterator */
-int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+		     struct pid *pid, struct task_struct *tsk)
 {
-	struct pid *pid;
-	struct task_struct *tsk;
 	char *buf, *path;
 	int retval;
 	struct cgroup_root *root;
@@ -5049,14 +5027,6 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 	if (!buf)
 		goto out;
 
-	retval = -ESRCH;
-	pid = m->private;
-	tsk = get_pid_task(pid, PIDTYPE_PID);
-	if (!tsk)
-		goto out_free;
-
-	retval = 0;
-
 	mutex_lock(&cgroup_mutex);
 	down_read(&css_set_rwsem);
 
@@ -5086,11 +5056,10 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 		seq_putc(m, '\n');
 	}
 
+	retval = 0;
 out_unlock:
 	up_read(&css_set_rwsem);
 	mutex_unlock(&cgroup_mutex);
-	put_task_struct(tsk);
-out_free:
 	kfree(buf);
 out:
 	return retval;
@@ -5161,7 +5130,7 @@ void cgroup_post_fork(struct task_struct *child)
 	int i;
 
 	/*
-	 * This may race against cgroup_enable_task_cg_links().  As that
+	 * This may race against cgroup_enable_task_cg_lists().  As that
 	 * function sets use_task_css_set_links before grabbing
 	 * tasklist_lock and we just went through tasklist_lock to add
 	 * @child, it's guaranteed that either we see the set
@@ -5176,7 +5145,7 @@ void cgroup_post_fork(struct task_struct *child)
 	 * when implementing operations which need to migrate all tasks of
 	 * a cgroup to another.
 	 *
-	 * Note that if we lose to cgroup_enable_task_cg_links(), @child
+	 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
 	 * will remain in init_css_set.  This is safe because all tasks are
 	 * in the init_css_set before cg_links is enabled and there's no
 	 * operation which transfers all tasks out of init_css_set.
@@ -5260,30 +5229,14 @@ void cgroup_exit(struct task_struct *tsk)
 	}
 
 	if (put_cset)
-		put_css_set(cset, true);
+		put_css_set(cset);
 }
 
 static void check_for_release(struct cgroup *cgrp)
 {
-	if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
-	    !css_has_online_children(&cgrp->self)) {
-		/*
-		 * Control Group is currently removeable. If it's not
-		 * already queued for a userspace notification, queue
-		 * it now
-		 */
-		int need_schedule_work = 0;
-
-		raw_spin_lock(&release_list_lock);
-		if (!cgroup_is_dead(cgrp) &&
-		    list_empty(&cgrp->release_list)) {
-			list_add(&cgrp->release_list, &release_list);
-			need_schedule_work = 1;
-		}
-		raw_spin_unlock(&release_list_lock);
-		if (need_schedule_work)
-			schedule_work(&release_agent_work);
-	}
+	if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+		schedule_work(&cgrp->release_agent_work);
 }
 
 /*
@@ -5311,52 +5264,39 @@ static void check_for_release(struct cgroup *cgrp)
  */
 static void cgroup_release_agent(struct work_struct *work)
 {
-	BUG_ON(work != &release_agent_work);
+	struct cgroup *cgrp =
+		container_of(work, struct cgroup, release_agent_work);
+	char *pathbuf = NULL, *agentbuf = NULL, *path;
+	char *argv[3], *envp[3];
+
 	mutex_lock(&cgroup_mutex);
-	raw_spin_lock(&release_list_lock);
-	while (!list_empty(&release_list)) {
-		char *argv[3], *envp[3];
-		int i;
-		char *pathbuf = NULL, *agentbuf = NULL, *path;
-		struct cgroup *cgrp = list_entry(release_list.next,
-						    struct cgroup,
-						    release_list);
-		list_del_init(&cgrp->release_list);
-		raw_spin_unlock(&release_list_lock);
-		pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-		if (!pathbuf)
-			goto continue_free;
-		path = cgroup_path(cgrp, pathbuf, PATH_MAX);
-		if (!path)
-			goto continue_free;
-		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-		if (!agentbuf)
-			goto continue_free;
-
-		i = 0;
-		argv[i++] = agentbuf;
-		argv[i++] = path;
-		argv[i] = NULL;
-
-		i = 0;
-		/* minimal command environment */
-		envp[i++] = "HOME=/";
-		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-		envp[i] = NULL;
-
-		/* Drop the lock while we invoke the usermode helper,
-		 * since the exec could involve hitting disk and hence
-		 * be a slow process */
-		mutex_unlock(&cgroup_mutex);
-		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-		mutex_lock(&cgroup_mutex);
- continue_free:
-		kfree(pathbuf);
-		kfree(agentbuf);
-		raw_spin_lock(&release_list_lock);
-	}
-	raw_spin_unlock(&release_list_lock);
+
+	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+	if (!pathbuf || !agentbuf)
+		goto out;
+
+	path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+	if (!path)
+		goto out;
+
+	argv[0] = agentbuf;
+	argv[1] = path;
+	argv[2] = NULL;
+
+	/* minimal command environment */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
 	mutex_unlock(&cgroup_mutex);
+	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+	goto out_free;
+out:
+	mutex_unlock(&cgroup_mutex);
+out_free:
+	kfree(agentbuf);
+	kfree(pathbuf);
 }
 
 static int __init cgroup_disable(char *str)
@@ -5416,7 +5356,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 	/*
 	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
-	 * protected for this access.  See cgroup_rmdir() for details.
+	 * protected for this access.  See css_release_work_fn() for details.
 	 */
 	cgrp = rcu_dereference(kn->priv);
 	if (cgrp)
@@ -5544,7 +5484,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 
 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
-	return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+	return (!cgroup_has_tasks(css->cgroup) &&
+		!css_has_online_children(&css->cgroup->self));
 }
 
 static struct cftype debug_files[] = {
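The central change above replaces the global release_list (with its raw spinlock and one shared release_agent_work item) by a work_struct embedded in each struct cgroup: check_for_release() simply schedules the per-cgroup work, the handler recovers its owning cgroup with container_of(), and css_free_work_fn() flushes the work with cancel_work_sync() before the cgroup is freed. A minimal userspace sketch of that embedded-work pattern, in plain C with hypothetical types that only mirror the kernel names (the real work item is queued with schedule_work() and runs from a workqueue):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {
	void (*func)(struct work_struct *work);
};

struct cgroup {
	const char *path;
	struct work_struct release_agent_work;	/* one work item per cgroup */
};

static void cgroup_release_agent(struct work_struct *work)
{
	/* the handler finds its owner without any global list */
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);

	printf("would exec release agent for %s\n", cgrp->path);
}

int main(void)
{
	struct cgroup cgrp = {
		.path = "/foo/bar",
		.release_agent_work = { .func = cgroup_release_agent },
	};

	/* stand-in for schedule_work(&cgrp.release_agent_work) */
	cgrp.release_agent_work.func(&cgrp.release_agent_work);
	return 0;
}

Once the object owns its work item, its lifetime is handled by the object's own teardown path (cancel_work_sync() in the free path) rather than by a lock-protected global list that every release site must maintain.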
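A related detail: cgroup_kn_lock_live() now uses cgroup_tryget() instead of cgroup_get(). After cgroup_destroy_locked() has put the base reference, the refcount may already have reached zero, and unconditionally bumping it would resurrect a dying cgroup; css_tryget() refuses once the count has hit zero. The kernel implements this on top of percpu_ref; the following is only a simplified single-counter analog of the tryget semantics, using C11 atomics:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only while the count is still nonzero; a count that
 * has reached zero is dying and must not be pinned again. */
static bool refcount_tryget(_Atomic long *cnt)
{
	long v = atomic_load(cnt);

	while (v != 0) {
		/* on failure, v is reloaded with the current value */
		if (atomic_compare_exchange_weak(cnt, &v, v + 1))
			return true;
	}
	return false;
}

int main(void)
{
	_Atomic long live = 1;	/* base reference still held */
	_Atomic long dying = 0;	/* base reference already put */

	printf("live:  %d\n", refcount_tryget(&live));	/* prints 1 */
	printf("dying: %d\n", refcount_tryget(&dying));	/* prints 0 */
	return 0;
}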
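For reference, the userspace contract visible in cgroup_release_agent() is unchanged by this rework: the binary named by release_agent_path is executed via call_usermodehelper() with the released cgroup's hierarchy-relative path as argv[1] and a minimal HOME/PATH environment. A hypothetical agent that just logs the path (the log location here is made up for illustration):

#include <stdio.h>

int main(int argc, char **argv)
{
	FILE *log = fopen("/tmp/release_agent.log", "a");	/* made-up path */

	if (!log)
		return 1;
	if (argc > 1)
		fprintf(log, "cgroup released: %s\n", argv[1]);
	fclose(log);
	return 0;
}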