diff options
Diffstat (limited to 'kernel/cgroup')
| -rw-r--r-- | kernel/cgroup/cgroup-v1.c | 17 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup.c | 141 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 56 | ||||
| -rw-r--r-- | kernel/cgroup/misc.c | 31 | ||||
| -rw-r--r-- | kernel/cgroup/rstat.c | 2 |
5 files changed, 128 insertions, 119 deletions
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 35b920328344..81c9e0685948 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -63,9 +63,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) for_each_root(root) { struct cgroup *from_cgrp; - if (root == &cgrp_dfl_root) - continue; - spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); spin_unlock_irq(&css_set_lock); @@ -662,11 +659,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. + * Grab the subsystems state racily. No need to add avenue to + * cgroup_mutex contention. */ - mutex_lock(&cgroup_mutex); for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", @@ -674,7 +669,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); - mutex_unlock(&cgroup_mutex); return 0; } @@ -701,8 +695,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) kernfs_type(kn) != KERNFS_DIR) return -EINVAL; - mutex_lock(&cgroup_mutex); - /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_online_from_dir(), @@ -710,9 +702,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { + if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); return -ENOENT; } rcu_read_unlock(); @@ -740,7 +731,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); - mutex_unlock(&cgroup_mutex); + cgroup_put(cgrp); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8afa8690d288..919194de39c8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1740,6 +1740,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; + u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1756,8 +1757,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; + + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; + } while_each_subsys_mask(); + if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; + + /* + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. + */ + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1766,10 +1787,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); - /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); @@ -2187,8 +2210,10 @@ static void cgroup_kill_sb(struct super_block *sb) * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { + cgroup_bpf_offline(&root->cgrp); percpu_ref_kill(&root->cgrp.self.refcnt); + } cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } @@ -5909,17 +5934,20 @@ struct cgroup *cgroup_get_from_id(u64 id) struct kernfs_node *kn; struct cgroup *cgrp = NULL; - mutex_lock(&cgroup_mutex); kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) - goto out_unlock; + goto out; + + rcu_read_lock(); - cgrp = kn->priv; - if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp)) + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (cgrp && !cgroup_tryget(cgrp)) cgrp = NULL; + + rcu_read_unlock(); + kernfs_put(kn); -out_unlock: - mutex_unlock(&cgroup_mutex); +out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); @@ -6472,30 +6500,34 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on - * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR) - * if @path points to a non-directory. + * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already + * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; - struct cgroup *cgrp; - - mutex_lock(&cgroup_mutex); + struct cgroup *cgrp = ERR_PTR(-ENOENT); kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); - if (kn) { - if (kernfs_type(kn) == KERNFS_DIR) { - cgrp = kn->priv; - cgroup_get_live(cgrp); - } else { - cgrp = ERR_PTR(-ENOTDIR); - } - kernfs_put(kn); - } else { - cgrp = ERR_PTR(-ENOENT); + if (!kn) + goto out; + + if (kernfs_type(kn) != KERNFS_DIR) { + cgrp = ERR_PTR(-ENOTDIR); + goto out_kernfs; } - mutex_unlock(&cgroup_mutex); + rcu_read_lock(); + + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (!cgrp || !cgroup_tryget(cgrp)) + cgrp = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + +out_kernfs: + kernfs_put(kn); +out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); @@ -6574,22 +6606,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - /* Don't associate the sock with unrelated interrupted task's cgroup. */ - if (in_interrupt()) - return; + struct cgroup *cgroup; rcu_read_lock(); + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) { + cgroup = &cgrp_dfl_root.cgrp; + cgroup_get(cgroup); + goto out; + } + while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->cgroup = cset->dfl_cgrp; - cgroup_bpf_get(cset->dfl_cgrp); + cgroup = cset->dfl_cgrp; break; } cpu_relax(); } +out: + skcd->cgroup = cgroup; + cgroup_bpf_get(cgroup); rcu_read_unlock(); } @@ -6616,44 +6655,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ -#ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type, - u32 flags) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); - mutex_unlock(&cgroup_mutex); - return ret; -} - -int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); - mutex_unlock(&cgroup_mutex); - return ret; -} - -int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_query(cgrp, attr, uattr); - mutex_unlock(&cgroup_mutex); - return ret; -} -#endif /* CONFIG_CGROUP_BPF */ - #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index df1ccf4558f8..2a9695ccb65f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -311,17 +311,19 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global locks guarding cpuset structures - cpuset_mutex and + * There are two global locks guarding cpuset structures - cpuset_rwsem and * callback_lock. We also require taking task_lock() when dereferencing a * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. + * comment. The cpuset code uses only cpuset_rwsem write lock. Other + * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to + * prevent change to cpuset structures. * * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it * is the only task able to also acquire callback_lock and be able to * modify cpusets. It can perform various checks on the cpuset structure * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_mutex. While it is performing these checks, various + * just holding cpuset_rwsem. While it is performing these checks, various * callback routines can briefly acquire callback_lock to query cpusets. * Once it is ready to make the changes, it takes callback_lock, blocking * everyone else. @@ -393,7 +395,7 @@ static inline bool is_in_v2_mode(void) * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void guarantee_online_cpus(struct task_struct *tsk, struct cpumask *pmask) @@ -435,7 +437,7 @@ out_unlock: * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -447,7 +449,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -468,7 +470,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. + * are only set if the other's are set. Call holding cpuset_rwsem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -577,7 +579,7 @@ static inline void free_cpuset(struct cpuset *cs) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_mutex held. + * cpuset_rwsem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -700,7 +702,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } -/* Must be called with cpuset_mutex held. */ +/* Must be called with cpuset_rwsem held. */ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ @@ -726,7 +728,7 @@ static inline int nr_cpusets(void) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cpuset_mutex held. + * Must be called with cpuset_rwsem held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a @@ -1005,7 +1007,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes cpus_read_lock(). + * Call with cpuset_rwsem held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1078,7 +1080,7 @@ void rebuild_sched_domains(void) * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * * Iterate through each task of @cs updating its cpus_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, + * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ static void update_tasks_cpumask(struct cpuset *cs) @@ -1347,7 +1349,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * - * Called with cpuset_mutex held + * Called with cpuset_rwsem held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) { @@ -1704,12 +1706,12 @@ static void *cpuset_being_rebound; * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, + * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ static void update_tasks_nodemask(struct cpuset *cs) { - static nodemask_t newmems; /* protected by cpuset_mutex */ + static nodemask_t newmems; /* protected by cpuset_rwsem */ struct css_task_iter it; struct task_struct *task; @@ -1722,7 +1724,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cpuset_mutex, we know that no other rebind effort + * the global cpuset_rwsem, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1768,7 +1770,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * - * Called with cpuset_mutex held + * Called with cpuset_rwsem held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { @@ -1821,7 +1823,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_mutex held. May take callback_lock during call. + * Call with cpuset_rwsem held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1911,7 +1913,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * @cs: the cpuset in which each task's spread flags needs to be changed * * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays + * function is called with cpuset_rwsem held, cpuset membership stays * stable. */ static void update_tasks_flags(struct cpuset *cs) @@ -1931,7 +1933,7 @@ static void update_tasks_flags(struct cpuset *cs) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_mutex held. + * Call with cpuset_rwsem held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1980,7 +1982,7 @@ out: * cs: the cpuset to update * new_prs: new partition root state * - * Call with cpuset_mutex held. + * Call with cpuset_rwsem held. */ static int update_prstate(struct cpuset *cs, int new_prs) { @@ -2167,7 +2169,7 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; -/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; @@ -2219,7 +2221,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) } /* - * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ @@ -2227,7 +2229,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup_taskset *tset) { - /* static buf protected by cpuset_mutex */ + /* static buf protected by cpuset_rwsem */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; @@ -2417,7 +2419,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy + * grabbing cpuset_rwsem anyway. This only happens on the legacy * hierarchies. */ css_get(&cs->css); @@ -3672,7 +3674,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_rwsem, keeping cpuset_attach() from changing it * anyway. */ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index ec02d963cad1..fe3e8a0eb7ed 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, new_usage = atomic_long_add_return(amount, &res->usage); if (new_usage > READ_ONCE(res->max) || new_usage > READ_ONCE(misc_res_capacity[type])) { - if (!res->failed) { - pr_info("cgroup: charge rejected by the misc controller for %s resource in ", - misc_res_name[type]); - pr_cont_cgroup_path(i->css.cgroup); - pr_cont("\n"); - res->failed = true; - } ret = -EBUSY; goto err_charge; } @@ -171,6 +164,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, return 0; err_charge: + for (j = i; j; j = parent_misc(j)) { + atomic_long_inc(&j->res[type].events); + cgroup_file_notify(&j->events_file); + } + for (j = cg; j != i; j = parent_misc(j)) misc_cg_cancel_charge(type, j, amount); misc_cg_cancel_charge(type, i, amount); @@ -335,6 +333,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) return 0; } +static int misc_events_show(struct seq_file *sf, void *v) +{ + struct misc_cg *cg = css_misc(seq_css(sf)); + unsigned long events, i; + + for (i = 0; i < MISC_CG_RES_TYPES; i++) { + events = atomic_long_read(&cg->res[i].events); + if (READ_ONCE(misc_res_capacity[i]) || events) + seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events); + } + return 0; +} + /* Misc cgroup interface files */ static struct cftype misc_cg_files[] = { { @@ -353,6 +364,12 @@ static struct cftype misc_cg_files[] = { .seq_show = misc_cg_capacity_show, .flags = CFTYPE_ONLY_ON_ROOT, }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct misc_cg, events_file), + .seq_show = misc_events_show, + }, {} }; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b264ab5652ba..1486768f2318 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -433,8 +433,6 @@ static void root_cgroup_cputime(struct task_cputime *cputime) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; - cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; - cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; } } |