-rw-r--r--  include/linux/cgroup-defs.h                          5
-rw-r--r--  include/linux/cgroup.h                              23
-rw-r--r--  include/linux/cpuset.h                               3
-rw-r--r--  include/linux/sched/task.h                           4
-rw-r--r--  include/uapi/linux/sched.h                           5
-rw-r--r--  kernel/cgroup/cgroup.c                             358
-rw-r--r--  kernel/cgroup/cpuset.c                              31
-rw-r--r--  kernel/cgroup/pids.c                                15
-rw-r--r--  kernel/fork.c                                       19
-rw-r--r--  kernel/power/process.c                               2
-rw-r--r--  tools/testing/selftests/cgroup/Makefile              6
-rw-r--r--  tools/testing/selftests/cgroup/cgroup_util.c       126
-rw-r--r--  tools/testing/selftests/cgroup/cgroup_util.h         4
-rw-r--r--  tools/testing/selftests/cgroup/test_core.c         177
-rw-r--r--  tools/testing/selftests/clone3/clone3_selftests.h   19
15 files changed, 661 insertions(+), 136 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 63097cb243cb..68c391f451d1 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -628,8 +628,9 @@ struct cgroup_subsys {
void (*cancel_attach)(struct cgroup_taskset *tset);
void (*attach)(struct cgroup_taskset *tset);
void (*post_attach)(void);
- int (*can_fork)(struct task_struct *task);
- void (*cancel_fork)(struct task_struct *task);
+ int (*can_fork)(struct task_struct *task,
+ struct css_set *cset);
+ void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
void (*fork)(struct task_struct *task);
void (*exit)(struct task_struct *task);
void (*release)(struct task_struct *task);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e75d2191226b..4598e4da6b1b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -27,6 +27,8 @@
#include <linux/cgroup-defs.h>
+struct kernel_clone_args;
+
#ifdef CONFIG_CGROUPS
/*
@@ -58,9 +60,6 @@ struct css_task_iter {
struct list_head *tcset_head;
struct list_head *task_pos;
- struct list_head *tasks_head;
- struct list_head *mg_tasks_head;
- struct list_head *dying_tasks_head;
struct list_head *cur_tasks_head;
struct css_set *cur_cset;
@@ -122,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);
void cgroup_fork(struct task_struct *p);
-extern int cgroup_can_fork(struct task_struct *p);
-extern void cgroup_cancel_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+ struct kernel_clone_args *kargs);
+extern void cgroup_cancel_fork(struct task_struct *p,
+ struct kernel_clone_args *kargs);
+extern void cgroup_post_fork(struct task_struct *p,
+ struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);
@@ -708,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry) { return -EINVAL; }
static inline void cgroup_fork(struct task_struct *p) {}
-static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
-static inline void cgroup_cancel_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+ struct kernel_clone_args *kargs) { return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+ struct kernel_clone_args *kargs) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+ struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 04c20de66afc..cede4cb98b78 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -54,7 +54,6 @@ extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
extern void cpuset_read_lock(void);
extern void cpuset_read_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
@@ -176,8 +175,6 @@ static inline void cpuset_update_active_cpus(void)
partition_sched_domains(1, NULL, NULL);
}
-static inline void cpuset_wait_for_hotplug(void) { }
-
static inline void cpuset_read_lock(void) { }
static inline void cpuset_read_unlock(void) { }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index f1879884238e..38359071236a 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -13,6 +13,7 @@
struct task_struct;
struct rusage;
union thread_union;
+struct css_set;
/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL
@@ -29,6 +30,9 @@ struct kernel_clone_args {
pid_t *set_tid;
/* Number of elements in *set_tid */
size_t set_tid_size;
+ int cgroup;
+ struct cgroup *cgrp;
+ struct css_set *cset;
};
/*
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 2e3bc22c6f20..3bac0a8ceab2 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -35,6 +35,7 @@
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
+#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -81,6 +82,8 @@
* @set_tid_size: This defines the size of the array referenced
* in @set_tid. This cannot be larger than the
* kernel's limit of nested PID namespaces.
+ * @cgroup: If CLONE_INTO_CGROUP is specified, set this to
+ * a file descriptor for the target cgroup.
*
* The structure is versioned by size and thus extensible.
* New struct members must go at the end of the struct and
@@ -97,11 +100,13 @@ struct clone_args {
__aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
+ __aligned_u64 cgroup;
};
#endif
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
/*
* Scheduling policies
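
As an aside, a minimal userspace sketch of the new flag (an illustration, not part of this patch: it assumes a kernel carrying this change and uapi headers exporting struct clone_args with the cgroup field; glibc has no clone3() wrapper, so the raw syscall is used, and "/sys/fs/cgroup/mycg" is a hypothetical target cgroup):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/sched.h>   /* struct clone_args, CLONE_INTO_CGROUP */
    #include <signal.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t pid;
            struct clone_args args = {
                    .flags       = CLONE_INTO_CGROUP,
                    .exit_signal = SIGCHLD,
            };

            /* An O_PATH directory fd is enough to identify the target cgroup. */
            int cgroup_fd = open("/sys/fs/cgroup/mycg",
                                 O_DIRECTORY | O_PATH | O_CLOEXEC);
            if (cgroup_fd < 0) {
                    perror("open");
                    return 1;
            }
            args.cgroup = cgroup_fd;

            pid = syscall(__NR_clone3, &args, sizeof(args));
            if (pid < 0) {
                    perror("clone3"); /* ENOSYS or E2BIG: kernel lacks support */
                    return 1;
            }
            if (pid == 0)
                    _exit(0); /* child starts life in the target cgroup */

            waitpid(pid, NULL, 0);
            close(cgroup_fd);
            return 0;
    }

With current headers, sizeof(struct clone_args) is CLONE_ARGS_SIZE_VER2 (88 bytes); older kernels reject the extended struct with E2BIG, which callers can treat the same as ENOSYS (as the selftest helper below does).
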
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7a39dc882095..114dcbf8d4f4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2714,11 +2714,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
{
DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
- int ret;
-
- ret = cgroup_migrate_vet_dst(dst_cgrp);
- if (ret)
- return ret;
+ int ret = 0;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -4148,7 +4144,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
} else if (likely(!(pos->flags & CSS_RELEASED))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
- list_for_each_entry_rcu(next, &parent->children, sibling)
+ list_for_each_entry_rcu(next, &parent->children, sibling,
+ lockdep_is_held(&cgroup_mutex))
if (next->serial_nr > pos->serial_nr)
break;
}
@@ -4391,29 +4388,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
- /* Advance to the next non-empty css_set */
- do {
- cset = css_task_iter_next_css_set(it);
- if (!cset) {
- it->task_pos = NULL;
- return;
+ /* Advance to the next non-empty css_set and find the first non-empty tasks list */
+ while ((cset = css_task_iter_next_css_set(it))) {
+ if (!list_empty(&cset->tasks)) {
+ it->cur_tasks_head = &cset->tasks;
+ break;
+ } else if (!list_empty(&cset->mg_tasks)) {
+ it->cur_tasks_head = &cset->mg_tasks;
+ break;
+ } else if (!list_empty(&cset->dying_tasks)) {
+ it->cur_tasks_head = &cset->dying_tasks;
+ break;
}
- } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-
- if (!list_empty(&cset->tasks)) {
- it->task_pos = cset->tasks.next;
- it->cur_tasks_head = &cset->tasks;
- } else if (!list_empty(&cset->mg_tasks)) {
- it->task_pos = cset->mg_tasks.next;
- it->cur_tasks_head = &cset->mg_tasks;
- } else {
- it->task_pos = cset->dying_tasks.next;
- it->cur_tasks_head = &cset->dying_tasks;
}
-
- it->tasks_head = &cset->tasks;
- it->mg_tasks_head = &cset->mg_tasks;
- it->dying_tasks_head = &cset->dying_tasks;
+ if (!cset) {
+ it->task_pos = NULL;
+ return;
+ }
+ it->task_pos = it->cur_tasks_head->next;
/*
* We don't keep css_sets locked across iteration steps and thus
@@ -4458,24 +4450,24 @@ static void css_task_iter_advance(struct css_task_iter *it)
repeat:
if (it->task_pos) {
/*
- * Advance iterator to find next entry. cset->tasks is
- * consumed first and then ->mg_tasks. After ->mg_tasks,
- * we move onto the next cset.
+ * Advance iterator to find next entry. We go through cset
+ * tasks, mg_tasks and dying_tasks; when they are consumed we
+ * move on to the next cset.
*/
if (it->flags & CSS_TASK_ITER_SKIPPED)
it->flags &= ~CSS_TASK_ITER_SKIPPED;
else
it->task_pos = it->task_pos->next;
- if (it->task_pos == it->tasks_head) {
- it->task_pos = it->mg_tasks_head->next;
- it->cur_tasks_head = it->mg_tasks_head;
+ if (it->task_pos == &it->cur_cset->tasks) {
+ it->cur_tasks_head = &it->cur_cset->mg_tasks;
+ it->task_pos = it->cur_tasks_head->next;
}
- if (it->task_pos == it->mg_tasks_head) {
- it->task_pos = it->dying_tasks_head->next;
- it->cur_tasks_head = it->dying_tasks_head;
+ if (it->task_pos == &it->cur_cset->mg_tasks) {
+ it->cur_tasks_head = &it->cur_cset->dying_tasks;
+ it->task_pos = it->cur_tasks_head->next;
}
- if (it->task_pos == it->dying_tasks_head)
+ if (it->task_pos == &it->cur_cset->dying_tasks)
css_task_iter_advance_css_set(it);
} else {
/* called from start, proceed to the first cset */
@@ -4493,12 +4485,12 @@ repeat:
goto repeat;
/* and dying leaders w/o live member threads */
- if (it->cur_tasks_head == it->dying_tasks_head &&
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
!atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
- if (it->cur_tasks_head == it->dying_tasks_head)
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
goto repeat;
}
}
@@ -4662,13 +4654,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
return 0;
}
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+{
+ int ret;
+ struct inode *inode;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ return ret;
+}
+
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
struct super_block *sb)
{
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
- struct inode *inode;
int ret;
lockdep_assert_held(&cgroup_mutex);
@@ -4678,12 +4685,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
com_cgrp = cgroup_parent(com_cgrp);
/* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
+ ret = cgroup_may_write(com_cgrp, sb);
if (ret)
return ret;
@@ -4699,6 +4701,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
return 0;
}
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb, bool threadgroup)
+{
+ int ret = 0;
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+ if (ret)
+ return ret;
+
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
+
+ if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+ ret = -EOPNOTSUPP;
+
+ return ret;
+}
+
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
@@ -4721,8 +4743,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb, true);
if (ret)
goto out_finish;
@@ -4766,16 +4788,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
spin_unlock_irq(&css_set_lock);
/* thread migrations follow the cgroup.procs delegation rule */
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb, false);
if (ret)
goto out_finish;
- /* and must be contained in the same domain */
- ret = -EOPNOTSUPP;
- if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
- goto out_finish;
-
ret = cgroup_attach_task(dst_cgrp, task, false);
out_finish:
@@ -5864,8 +5881,7 @@ out:
* @child: pointer to task_struct of forking parent process.
*
* A task is associated with the init_css_set until cgroup_post_fork()
- * attaches it to the parent's css_set. Empty cg_list indicates that
- * @child isn't holding reference to its css_set.
+ * attaches it to the target css_set.
*/
void cgroup_fork(struct task_struct *child)
{
@@ -5873,21 +5889,172 @@ void cgroup_fork(struct task_struct *child)
INIT_LIST_HEAD(&child->cg_list);
}
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *cgrp;
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ cgrp = css->cgroup;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+
+/**
+ * cgroup_css_set_fork - find or create a css_set for a child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This function finds or creates a new css_set which the child
+ * process will be attached to in cgroup_post_fork(). By default,
+ * the child process will be given the same css_set as its parent.
+ *
+ * If CLONE_INTO_CGROUP is specified, this function will try to find an
+ * existing css_set which includes the requested cgroup and, if not, create
+ * a new css_set that the child will be attached to later. If this function
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * CLONE_INTO_CGROUP is requested, this function will grab cgroup_mutex
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * to the target cgroup.
+ */
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+ __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+{
+ int ret;
+ struct cgroup *dst_cgrp = NULL;
+ struct css_set *cset;
+ struct super_block *sb;
+ struct file *f;
+
+ if (kargs->flags & CLONE_INTO_CGROUP)
+ mutex_lock(&cgroup_mutex);
+
+ cgroup_threadgroup_change_begin(current);
+
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+ kargs->cset = cset;
+ return 0;
+ }
+
+ f = fget_raw(kargs->cgroup);
+ if (!f) {
+ ret = -EBADF;
+ goto err;
+ }
+ sb = f->f_path.dentry->d_sb;
+
+ dst_cgrp = cgroup_get_from_file(f);
+ if (IS_ERR(dst_cgrp)) {
+ ret = PTR_ERR(dst_cgrp);
+ dst_cgrp = NULL;
+ goto err;
+ }
+
+ if (cgroup_is_dead(dst_cgrp)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ /*
+ * Verify that the target cgroup is writable for us. This is
+ * usually done by the vfs layer but since we're not going through
+ * the vfs layer here we need to do it "manually".
+ */
+ ret = cgroup_may_write(dst_cgrp, sb);
+ if (ret)
+ goto err;
+
+ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+ !(kargs->flags & CLONE_THREAD));
+ if (ret)
+ goto err;
+
+ kargs->cset = find_css_set(cset, dst_cgrp);
+ if (!kargs->cset) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ put_css_set(cset);
+ fput(f);
+ kargs->cgrp = dst_cgrp;
+ return ret;
+
+err:
+ cgroup_threadgroup_change_end(current);
+ mutex_unlock(&cgroup_mutex);
+ if (f)
+ fput(f);
+ if (dst_cgrp)
+ cgroup_put(dst_cgrp);
+ put_css_set(cset);
+ if (kargs->cset)
+ put_css_set(kargs->cset);
+ return ret;
+}
+
+/**
+ * cgroup_css_set_put_fork - drop references we took during fork
+ * @kargs: the arguments passed to create the child process
+ *
+ * Drop references to the prepared css_set and target cgroup if
+ * CLONE_INTO_CGROUP was requested.
+ */
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+ cgroup_threadgroup_change_end(current);
+
+ if (kargs->flags & CLONE_INTO_CGROUP) {
+ struct cgroup *cgrp = kargs->cgrp;
+ struct css_set *cset = kargs->cset;
+
+ mutex_unlock(&cgroup_mutex);
+
+ if (cset) {
+ put_css_set(cset);
+ kargs->cset = NULL;
+ }
+
+ if (cgrp) {
+ cgroup_put(cgrp);
+ kargs->cgrp = NULL;
+ }
+ }
+}
+
/**
* cgroup_can_fork - called on a new task before the process is exposed
- * @child: the task in question.
+ * @child: the child process
*
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
- * returns an error, the fork aborts with that error code. This allows for
- * a cgroup subsystem to conditionally allow or deny new forks.
+ * This prepares a new css_set for the child process which the child will
+ * be attached to in cgroup_post_fork().
+ * This calls the subsystem can_fork() callbacks. If a can_fork()
+ * callback returns an error, the fork aborts with that error code. This
+ * allows a cgroup subsystem to conditionally allow or deny new forks.
*/
-int cgroup_can_fork(struct task_struct *child)
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i, j, ret;
+ ret = cgroup_css_set_fork(kargs);
+ if (ret)
+ return ret;
+
do_each_subsys_mask(ss, i, have_canfork_callback) {
- ret = ss->can_fork(child);
+ ret = ss->can_fork(child, kargs->cset);
if (ret)
goto out_revert;
} while_each_subsys_mask();
@@ -5899,54 +6066,64 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child);
+ ss->cancel_fork(child, kargs->cset);
}
+ cgroup_css_set_put_fork(kargs);
+
return ret;
}
/**
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
- * @child: the task in question
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeded.
+ * cgroup_can_fork() succeeded and cleans up references we took to
+ * prepare a new css_set for the child process in cgroup_can_fork().
*/
-void cgroup_cancel_fork(struct task_struct *child)
+void cgroup_cancel_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child);
+ ss->cancel_fork(child, kargs->cset);
+
+ cgroup_css_set_put_fork(kargs);
}
/**
- * cgroup_post_fork - called on a new task after adding it to the task list
- * @child: the task in question
- *
- * Adds the task to the list running through its css_set if necessary and
- * call the subsystem fork() callbacks. Has to be after the task is
- * visible on the task list in case we race with the first call to
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
- * list.
+ * cgroup_post_fork - finalize cgroup setup for the child process
+ * @child: the child process
+ *
+ * Attach the child process to its css_set and call the subsystem fork()
+ * callbacks.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
+ cset = kargs->cset;
+ kargs->cset = NULL;
+
spin_lock_irq(&css_set_lock);
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
WARN_ON_ONCE(!list_empty(&child->cg_list));
- cset = task_css_set(current); /* current is @child's parent */
- get_css_set(cset);
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
+ } else {
+ put_css_set(cset);
+ cset = NULL;
}
/*
@@ -5978,6 +6155,17 @@ void cgroup_post_fork(struct task_struct *child)
do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
} while_each_subsys_mask();
+
+ /* Make the new cset the root_cset of the new cgroup namespace. */
+ if (kargs->flags & CLONE_NEWCGROUP) {
+ struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+
+ get_css_set(cset);
+ child->nsproxy->cgroup_ns->root_cset = cset;
+ put_css_set(rcset);
+ }
+
+ cgroup_css_set_put_fork(kargs);
}
/**
@@ -6164,7 +6352,6 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
*/
struct cgroup *cgroup_get_from_fd(int fd)
{
- struct cgroup_subsys_state *css;
struct cgroup *cgrp;
struct file *f;
@@ -6172,17 +6359,8 @@ struct cgroup *cgroup_get_from_fd(int fd)
if (!f)
return ERR_PTR(-EBADF);
- css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ cgrp = cgroup_get_from_file(f);
fput(f);
- if (IS_ERR(css))
- return ERR_CAST(css);
-
- cgrp = css->cgroup;
- if (!cgroup_on_dfl(cgrp)) {
- cgroup_put(cgrp);
- return ERR_PTR(-EBADF);
- }
-
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
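
The error paths in cgroup_css_set_fork() above surface to userspace as clone3() errno values. A hedged sketch (same raw-syscall assumptions as the earlier example; "/sys/fs/cgroup/restricted" is a hypothetical path) that maps the common failures back to the individual checks:

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <linux/sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            struct clone_args args = {
                    .flags       = CLONE_INTO_CGROUP,
                    .exit_signal = SIGCHLD,
            };
            int fd = open("/sys/fs/cgroup/restricted",
                          O_DIRECTORY | O_PATH | O_CLOEXEC);
            pid_t pid;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            args.cgroup = fd;

            pid = syscall(__NR_clone3, &args, sizeof(args));
            if (pid < 0) {
                    if (errno == EACCES)      /* cgroup_may_write()/delegation check */
                            puts("no write access to the target cgroup");
                    else if (errno == ENODEV) /* cgroup_is_dead() */
                            puts("target cgroup was removed");
                    else if (errno == EBADF)  /* not a cgroup2 fd */
                            puts("fd is not a cgroup on the default hierarchy");
                    else
                            printf("clone3: %s\n", strerror(errno));
                    return 1;
            }
            if (pid == 0)
                    _exit(0);
            waitpid(pid, NULL, 0);
            close(fd);
            return 0;
    }
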
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58f5073acff7..cafd4d2ff882 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3101,7 +3101,7 @@ update_tasks:
}
/**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -3116,7 +3116,7 @@ update_tasks:
* Note that CPU offlining during suspend is ignored. We don't modify
* cpusets across suspend/resume cycles at all.
*/
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_hotplug(bool use_cpu_hp_lock)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
@@ -3201,25 +3201,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* rebuild sched domains if cpus_allowed has changed */
if (cpus_updated || force_rebuild) {
force_rebuild = false;
- rebuild_sched_domains();
+ if (use_cpu_hp_lock)
+ rebuild_sched_domains();
+ else {
+ /*
+  * Acquiring cpu_hotplug_lock is not required.
+  * When cpuset_hotplug() is called in the hotplug path,
+  * cpu_hotplug_lock is held by the hotplug context
+  * which is waiting for cpuhp_thread_fun to indicate
+  * completion of the callback.
+  */
+ percpu_down_write(&cpuset_rwsem);
+ rebuild_sched_domains_locked();
+ percpu_up_write(&cpuset_rwsem);
+ }
}
free_cpumasks(NULL, ptmp);
}
-void cpuset_update_active_cpus(void)
+static void cpuset_hotplug_workfn(struct work_struct *work)
{
- /*
- * We're inside cpu hotplug critical region which usually nests
- * inside cgroup synchronization. Bounce actual hotplug processing
- * to a work item to avoid reverse locking order.
- */
- schedule_work(&cpuset_hotplug_work);
+ cpuset_hotplug(true);
}
-void cpuset_wait_for_hotplug(void)
+void cpuset_update_active_cpus(void)
{
- flush_work(&cpuset_hotplug_work);
+ cpuset_hotplug(false);
}
/*
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 138059eb730d..511af87f685e 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -33,6 +33,7 @@
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
+#include <linux/sched/task.h>
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
@@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on cgroup_threadgroup_change_begin() held by the copy_process().
*/
-static int pids_can_fork(struct task_struct *task)
+static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
int err;
- css = task_css_check(current, pids_cgrp_id, true);
+ if (cset)
+ css = cset->subsys[pids_cgrp_id];
+ else
+ css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
err = pids_try_charge(pids, 1);
if (err) {
@@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task)
return err;
}
-static void pids_cancel_fork(struct task_struct *task)
+static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
- css = task_css_check(current, pids_cgrp_id, true);
+ if (cset)
+ css = cset->subsys[pids_cgrp_id];
+ else
+ css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
pids_uncharge(pids, 1);
}
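
With the change above, pids_can_fork() charges the child's target css_set rather than the parent's. A hedged sketch of the observable effect (hypothetical path; assumes the pids controller is enabled and "0" has been written to the target's pids.max beforehand): the clone should fail with EAGAIN even when the parent's own cgroup has headroom.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <linux/sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            struct clone_args args = {
                    .flags       = CLONE_INTO_CGROUP,
                    .exit_signal = SIGCHLD,
            };
            /* Hypothetical cgroup whose pids.max was set to 0 beforehand. */
            int fd = open("/sys/fs/cgroup/limited",
                          O_DIRECTORY | O_PATH | O_CLOEXEC);
            pid_t pid;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            args.cgroup = fd;

            pid = syscall(__NR_clone3, &args, sizeof(args));
            if (pid < 0) {
                    if (errno == EAGAIN) {
                            /* pids_can_fork() charged the target cset's pids cgroup. */
                            puts("fork rejected by the target cgroup's pids limit");
                            close(fd);
                            return 0;
                    }
                    perror("clone3"); /* e.g. ENOSYS/E2BIG on older kernels */
                    close(fd);
                    return 1;
            }
            if (pid == 0)
                    _exit(0); /* limit was not enforced after all */
            waitpid(pid, NULL, 0);
            close(fd);
            return 0;
    }
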
diff --git a/kernel/fork.c b/kernel/fork.c
index 60a1295f4384..635d6369dfb9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2174,16 +2174,15 @@ static __latent_entropy struct task_struct *copy_process(
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
- cgroup_threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
- retval = cgroup_can_fork(p);
+ retval = cgroup_can_fork(p, args);
if (retval)
- goto bad_fork_cgroup_threadgroup_change_end;
+ goto bad_fork_put_pidfd;
/*
* From this point on we must avoid any synchronous user-space
@@ -2288,8 +2287,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
- cgroup_threadgroup_change_end(current);
+ cgroup_post_fork(p, args);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
@@ -2300,9 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
- cgroup_cancel_fork(p);
-bad_fork_cgroup_threadgroup_change_end:
- cgroup_threadgroup_change_end(current);
+ cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
fput(pidfile);
@@ -2631,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
!valid_signal(args.exit_signal)))
return -EINVAL;
+ if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup > INT_MAX)
+ return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
@@ -2641,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
+ .cgroup = args.cgroup,
};
if (args.set_tid &&
@@ -2684,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
- if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
+ if (kargs->flags &
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
return false;
/*
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4b6a54da7e65..08f7019357ee 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -204,8 +204,6 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
- cpuset_wait_for_hotplug();
-
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
index 66aafe1f5746..967f268fde74 100644
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -11,6 +11,6 @@ TEST_GEN_PROGS += test_freezer
include ../lib.mk
-$(OUTPUT)/test_memcontrol: cgroup_util.c
-$(OUTPUT)/test_core: cgroup_util.c
-$(OUTPUT)/test_freezer: cgroup_util.c
+$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
index 8f7131dcf1ff..8a637ca7d73a 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -15,6 +15,7 @@
#include <unistd.h>
#include "cgroup_util.h"
+#include "../clone3/clone3_selftests.h"
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
@@ -331,12 +332,112 @@ int cg_run(const char *cgroup,
}
}
+pid_t clone_into_cgroup(int cgroup_fd)
+{
+#ifdef CLONE_ARGS_SIZE_VER2
+ pid_t pid;
+
+ struct clone_args args = {
+ .flags = CLONE_INTO_CGROUP,
+ .exit_signal = SIGCHLD,
+ .cgroup = cgroup_fd,
+ };
+
+ pid = sys_clone3(&args, sizeof(struct clone_args));
+ /*
+ * Verify that this is a genuine test failure:
+ * ENOSYS -> clone3() not available
+ * E2BIG -> CLONE_INTO_CGROUP not available
+ */
+ if (pid < 0 && (errno == ENOSYS || errno == E2BIG))
+ goto pretend_enosys;
+
+ return pid;
+
+pretend_enosys:
+#endif
+ errno = ENOSYS;
+ return -ENOSYS;
+}
+
+int clone_reap(pid_t pid, int options)
+{
+ int ret;
+ siginfo_t info = {
+ .si_signo = 0,
+ };
+
+again:
+ ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
+ if (ret < 0) {
+ if (errno == EINTR)
+ goto again;
+ return -1;
+ }
+
+ if (options & WEXITED) {
+ if (WIFEXITED(info.si_status))
+ return WEXITSTATUS(info.si_status);
+ }
+
+ if (options & WSTOPPED) {
+ if (WIFSTOPPED(info.si_status))
+ return WSTOPSIG(info.si_status);
+ }
+
+ if (options & WCONTINUED) {
+ if (WIFCONTINUED(info.si_status))
+ return 0;
+ }
+
+ return -1;
+}
+
+int dirfd_open_opath(const char *dir)
+{
+ return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
+}
+
+#define close_prot_errno(fd) \
+ if (fd >= 0) { \
+ int _e_ = errno; \
+ close(fd); \
+ errno = _e_; \
+ }
+
+static int clone_into_cgroup_run_nowait(const char *cgroup,
+ int (*fn)(const char *cgroup, void *arg),
+ void *arg)
+{
+ int cgroup_fd;
+ pid_t pid;
+
+ cgroup_fd = dirfd_open_opath(cgroup);
+ if (cgroup_fd < 0)
+ return -1;
+
+ pid = clone_into_cgroup(cgroup_fd);
+ close_prot_errno(cgroup_fd);
+ if (pid == 0)
+ exit(fn(cgroup, arg));
+
+ return pid;
+}
+
int cg_run_nowait(const char *cgroup,
int (*fn)(const char *cgroup, void *arg),
void *arg)
{
int pid;
+ pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);
+ if (pid > 0)
+ return pid;
+
+ /* Genuine test failure. */
+ if (pid < 0 && errno != ENOSYS)
+ return -1;
+
pid = fork();
if (pid == 0) {
char buf[64];
@@ -450,3 +551,28 @@ int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
return strstr(buf, needle) ? 0 : -1;
}
+
+int clone_into_cgroup_run_wait(const char *cgroup)
+{
+ int cgroup_fd;
+ pid_t pid;
+
+ cgroup_fd = dirfd_open_opath(cgroup);
+ if (cgroup_fd < 0)
+ return -1;
+
+ pid = clone_into_cgroup(cgroup_fd);
+ close_prot_errno(cgroup_fd);
+ if (pid < 0)
+ return -1;
+
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+
+ /*
+ * We don't care whether this fails. We only care whether the initial
+ * clone succeeded.
+ */
+ (void)clone_reap(pid, WEXITED);
+ return 0;
+}
diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h
index 49c54fbdb229..5a1305dd1f0b 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/cgroup_util.h
@@ -50,3 +50,7 @@ extern int cg_wait_for_proc_count(const char *cgroup, int count);
extern int cg_killall(const char *cgroup);
extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
+extern pid_t clone_into_cgroup(int cgroup_fd);
+extern int clone_reap(pid_t pid, int options);
+extern int clone_into_cgroup_run_wait(const char *cgroup);
+extern int dirfd_open_opath(const char *dir);
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index e19ce940cd6a..3df648c37876 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -2,7 +2,10 @@
#include <linux/limits.h>
#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
#include <unistd.h>
+#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <signal.h>
@@ -12,6 +15,115 @@
#include "../kselftest.h"
#include "cgroup_util.h"
+static int touch_anon(char *buf, size_t size)
+{
+ int fd;
+ char *pos = buf;
+
+ fd = open("/dev/urandom", O_RDONLY);
+ if (fd < 0)
+ return -1;
+
+ while (size > 0) {
+ ssize_t ret = read(fd, pos, size);
+
+ if (ret < 0) {
+ if (errno != EINTR) {
+ close(fd);
+ return -1;
+ }
+ } else {
+ pos += ret;
+ size -= ret;
+ }
+ }
+ close(fd);
+
+ return 0;
+}
+
+static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
+{
+ int ppid = getppid();
+ size_t size = (size_t)arg;
+ void *buf;
+
+ buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+ 0, 0);
+ if (buf == MAP_FAILED)
+ return -1;
+
+ if (touch_anon((char *)buf, size)) {
+ munmap(buf, size);
+ return -1;
+ }
+
+ while (getppid() == ppid)
+ sleep(1);
+
+ munmap(buf, size);
+ return 0;
+}
+
+/*
+ * Create a child process that allocates and touches 100MB, then waits to be
+ * killed. Wait until the child is attached to the cgroup, kill all processes
+ * in that cgroup and wait until "cgroup.procs" is empty. At this point try to
+ * destroy the empty cgroup. The test helps detect race conditions between
+ * dying processes leaving the cgroup and the cgroup destruction path.
+ */
+static int test_cgcore_destroy(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg_test = NULL;
+ int child_pid;
+ char buf[PAGE_SIZE];
+
+ cg_test = cg_name(root, "cg_test");
+
+ if (!cg_test)
+ goto cleanup;
+
+ for (int i = 0; i < 10; i++) {
+ if (cg_create(cg_test))
+ goto cleanup;
+
+ child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
+ (void *) MB(100));
+
+ if (child_pid < 0)
+ goto cleanup;
+
+ /* wait for the child to enter cgroup */
+ if (cg_wait_for_proc_count(cg_test, 1))
+ goto cleanup;
+
+ if (cg_killall(cg_test))
+ goto cleanup;
+
+ /* wait for cgroup to be empty */
+ while (1) {
+ if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
+ goto cleanup;
+ if (buf[0] == '\0')
+ break;
+ usleep(1000);
+ }
+
+ if (rmdir(cg_test))
+ goto cleanup;
+
+ if (waitpid(child_pid, NULL, 0) < 0)
+ goto cleanup;
+ }
+ ret = KSFT_PASS;
+cleanup:
+ if (cg_test)
+ cg_destroy(cg_test);
+ free(cg_test);
+ return ret;
+}
+
/*
* A(0) - B(0) - C(1)
* \ D(0)
@@ -25,8 +137,11 @@
static int test_cgcore_populated(const char *root)
{
int ret = KSFT_FAIL;
+ int err;
char *cg_test_a = NULL, *cg_test_b = NULL;
char *cg_test_c = NULL, *cg_test_d = NULL;
+ int cgroup_fd = -EBADF;
+ pid_t pid;
cg_test_a = cg_name(root, "cg_test_a");
cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
@@ -78,6 +193,52 @@ static int test_cgcore_populated(const char *root)
if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
goto cleanup;
+ /* Test that we can directly clone into a new cgroup. */
+ cgroup_fd = dirfd_open_opath(cg_test_d);
+ if (cgroup_fd < 0)
+ goto cleanup;
+
+ pid = clone_into_cgroup(cgroup_fd);
+ if (pid < 0) {
+ if (errno == ENOSYS)
+ goto cleanup_pass;
+ goto cleanup;
+ }
+
+ if (pid == 0) {
+ if (raise(SIGSTOP))
+ exit(EXIT_FAILURE);
+ exit(EXIT_SUCCESS);
+ }
+
+ err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n");
+
+ (void)clone_reap(pid, WSTOPPED);
+ (void)kill(pid, SIGCONT);
+ (void)clone_reap(pid, WEXITED);
+
+ if (err)
+ goto cleanup;
+
+ if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+ goto cleanup;
+
+ /* Remove cgroup. */
+ if (cg_test_d) {
+ cg_destroy(cg_test_d);
+ free(cg_test_d);
+ cg_test_d = NULL;
+ }
+
+ pid = clone_into_cgroup(cgroup_fd);
+ if (pid < 0)
+ goto cleanup_pass;
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+ (void)clone_reap(pid, WEXITED);
+ goto cleanup;
+
+cleanup_pass:
ret = KSFT_PASS;
cleanup:
@@ -93,6 +254,8 @@ cleanup:
free(cg_test_c);
free(cg_test_b);
free(cg_test_a);
+ if (cgroup_fd >= 0)
+ close(cgroup_fd);
return ret;
}
@@ -136,6 +299,16 @@ static int test_cgcore_invalid_domain(const char *root)
if (errno != EOPNOTSUPP)
goto cleanup;
+ if (!clone_into_cgroup_run_wait(child))
+ goto cleanup;
+
+ if (errno == ENOSYS)
+ goto cleanup_pass;
+
+ if (errno != EOPNOTSUPP)
+ goto cleanup;
+
+cleanup_pass:
ret = KSFT_PASS;
cleanup:
@@ -345,6 +518,9 @@ static int test_cgcore_internal_process_constraint(const char *root)
if (!cg_enter_current(parent))
goto cleanup;
+ if (!clone_into_cgroup_run_wait(parent))
+ goto cleanup;
+
ret = KSFT_PASS;
cleanup:
@@ -512,6 +688,7 @@ struct corecg_test {
T(test_cgcore_populated),
T(test_cgcore_proc_migration),
T(test_cgcore_thread_migration),
+ T(test_cgcore_destroy),
};
#undef T
diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h
index a3f2c8ad8bcc..91c1a78ddb39 100644
--- a/tools/testing/selftests/clone3/clone3_selftests.h
+++ b/tools/testing/selftests/clone3/clone3_selftests.h
@@ -5,12 +5,24 @@
#define _GNU_SOURCE
#include <sched.h>
+#include <linux/sched.h>
+#include <linux/types.h>
#include <stdint.h>
#include <syscall.h>
-#include <linux/types.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
+#ifndef CLONE_INTO_CGROUP
+#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#endif
+
+#ifndef CLONE_ARGS_SIZE_VER0
+#define CLONE_ARGS_SIZE_VER0 64
+#endif
+
#ifndef __NR_clone3
#define __NR_clone3 -1
struct clone_args {
@@ -22,10 +34,13 @@ struct clone_args {
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
+#define CLONE_ARGS_SIZE_VER1 80
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
+#define CLONE_ARGS_SIZE_VER2 88
+ __aligned_u64 cgroup;
};
-#endif
+#endif /* __NR_clone3 */
static pid_t sys_clone3(struct clone_args *args, size_t size)
{