aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/cgroup-v1/pids.rst3
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst47
-rw-r--r--MAINTAINERS1
-rw-r--r--include/linux/cgroup-defs.h7
-rw-r--r--include/linux/misc_cgroup.h12
-rw-r--r--kernel/cgroup/cgroup.c36
-rw-r--r--kernel/cgroup/cpuset.c197
-rw-r--r--kernel/cgroup/misc.c80
-rw-r--r--kernel/cgroup/pids.c129
-rw-r--r--kernel/cgroup/rstat.c37
-rw-r--r--tools/testing/selftests/cgroup/.gitignore11
-rw-r--r--tools/testing/selftests/cgroup/Makefile25
-rwxr-xr-xtools/testing/selftests/cgroup/test_cpuset_prs.sh75
-rw-r--r--tools/testing/selftests/cgroup/test_pids.c178
14 files changed, 679 insertions, 159 deletions
diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst
index 6acebd9e72c8..0f9f9a7b1f6c 100644
--- a/Documentation/admin-guide/cgroup-v1/pids.rst
+++ b/Documentation/admin-guide/cgroup-v1/pids.rst
@@ -36,7 +36,8 @@ superset of parent/child/pids.current.
The pids.events file contains event counters:
- - max: Number of times fork failed because limit was hit.
+ - max: Number of times fork failed in the cgroup because limit was hit in
+ self or ancestors.
Example
-------
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8fbb0519d556..05862f06ed26 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -239,6 +239,13 @@ cgroup v2 currently supports the following mount options.
will not be tracked by the memory controller (even if cgroup
v2 is remounted later on).
+ pids_localevents
+ The option restores v1-like behavior of pids.events:max, that is only
+ local (inside cgroup proper) fork failures are counted. Without this
+ option pids.events.max represents any pids.max enforcemnt across
+ cgroup's subtree.
+
+
Organizing Processes and Threads
--------------------------------
@@ -2205,12 +2212,18 @@ PID Interface Files
descendants has ever reached.
pids.events
- A read-only flat-keyed file which exists on non-root cgroups. The
- following entries are defined. Unless specified otherwise, a value
- change in this file generates a file modified event.
+ A read-only flat-keyed file which exists on non-root cgroups. Unless
+ specified otherwise, a value change in this file generates a file
+ modified event. The following entries are defined.
max
- Number of times fork failed because limit was hit.
+ The number of times the cgroup's total number of processes hit the pids.max
+ limit (see also pids_localevents).
+
+ pids.events.local
+ Similar to pids.events but the fields in the file are local
+ to the cgroup i.e. not hierarchical. The file modified event
+ generated on this file reflects only the local events.
Organisational operations are not blocked by cgroup policies, so it is
possible to have pids.current > pids.max. This can be done by either
@@ -2346,8 +2359,12 @@ Cpuset Interface Files
is always a subset of it.
Users can manually set it to a value that is different from
- "cpuset.cpus". The only constraint in setting it is that the
- list of CPUs must be exclusive with respect to its sibling.
+ "cpuset.cpus". One constraint in setting it is that the list of
+ CPUs must be exclusive with respect to "cpuset.cpus.exclusive"
+ of its sibling. If "cpuset.cpus.exclusive" of a sibling cgroup
+ isn't set, its "cpuset.cpus" value, if set, cannot be a subset
+ of it to leave at least one CPU available when the exclusive
+ CPUs are taken away.
For a parent cgroup, any one of its exclusive CPUs can only
be distributed to at most one of its child cgroups. Having an
@@ -2363,8 +2380,8 @@ Cpuset Interface Files
cpuset-enabled cgroups.
This file shows the effective set of exclusive CPUs that
- can be used to create a partition root. The content of this
- file will always be a subset of "cpuset.cpus" and its parent's
+ can be used to create a partition root. The content
+ of this file will always be a subset of its parent's
"cpuset.cpus.exclusive.effective" if its parent is not the root
cgroup. It will also be a subset of "cpuset.cpus.exclusive"
if it is set. If "cpuset.cpus.exclusive" is not set, it is
@@ -2625,6 +2642,15 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_
res_a 3
res_b 0
+ misc.peak
+ A read-only flat-keyed file shown in all cgroups. It shows the
+ historical maximum usage of the resources in the cgroup and its
+ children.::
+
+ $ cat misc.peak
+ res_a 10
+ res_b 8
+
misc.max
A read-write flat-keyed file shown in the non root cgroups. Allowed
maximum usage of the resources in the cgroup and its children.::
@@ -2654,6 +2680,11 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_
The number of times the cgroup's resource usage was
about to go over the max boundary.
+ misc.events.local
+ Similar to misc.events but the fields in the file are local to the
+ cgroup i.e. not hierarchical. The file modified event generated on
+ this file reflects only the local events.
+
Migration and Ownership
~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/MAINTAINERS b/MAINTAINERS
index b1ce34379868..9cae268966e7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5528,6 +5528,7 @@ CONTROL GROUP (CGROUP)
M: Tejun Heo <[email protected]>
M: Zefan Li <[email protected]>
M: Johannes Weiner <[email protected]>
+M: Michal Koutný <[email protected]>
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ea48c861cd36..b36690ca0d3f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -119,7 +119,12 @@ enum {
/*
* Enable hugetlb accounting for the memory controller.
*/
- CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
+ CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
+
+ /*
+ * Enable legacy local pids.events.
+ */
+ CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20),
};
/* cftype->flags */
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index e799b1f8d05b..49eef10c8e59 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -9,15 +9,16 @@
#define _MISC_CGROUP_H_
/**
- * Types of misc cgroup entries supported by the host.
+ * enum misc_res_type - Types of misc cgroup entries supported by the host.
*/
enum misc_res_type {
#ifdef CONFIG_KVM_AMD_SEV
- /* AMD SEV ASIDs resource */
+ /** @MISC_CG_RES_SEV: AMD SEV ASIDs resource */
MISC_CG_RES_SEV,
- /* AMD SEV-ES ASIDs resource */
+ /** @MISC_CG_RES_SEV_ES: AMD SEV-ES ASIDs resource */
MISC_CG_RES_SEV_ES,
#endif
+ /** @MISC_CG_RES_TYPES: count of enum misc_res_type constants */
MISC_CG_RES_TYPES
};
@@ -30,13 +31,16 @@ struct misc_cg;
/**
* struct misc_res: Per cgroup per misc type resource
* @max: Maximum limit on the resource.
+ * @watermark: Historical maximum usage of the resource.
* @usage: Current usage of the resource.
* @events: Number of times, the resource limit exceeded.
*/
struct misc_res {
u64 max;
+ atomic64_t watermark;
atomic64_t usage;
atomic64_t events;
+ atomic64_t events_local;
};
/**
@@ -50,6 +54,8 @@ struct misc_cg {
/* misc.events */
struct cgroup_file events_file;
+ /* misc.events.local */
+ struct cgroup_file events_local_file;
struct misc_res res[MISC_CG_RES_TYPES];
};
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e32b6972c478..c8e4b62b436a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1744,8 +1744,11 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (cgroup_psi_enabled()) {
ret = cgroup_addrm_files(css, cgrp,
cgroup_psi_files, true);
- if (ret < 0)
+ if (ret < 0) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
return ret;
+ }
}
} else {
ret = cgroup_addrm_files(css, cgrp,
@@ -1839,9 +1842,9 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
- css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
+ css->cgroup = dcgrp;
WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
e_cset_node[ss->id]) {
@@ -1922,6 +1925,7 @@ enum cgroup2_param {
Opt_memory_localevents,
Opt_memory_recursiveprot,
Opt_memory_hugetlb_accounting,
+ Opt_pids_localevents,
nr__cgroup2_params
};
@@ -1931,6 +1935,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+ fsparam_flag("pids_localevents", Opt_pids_localevents),
{}
};
@@ -1960,6 +1965,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_memory_hugetlb_accounting:
ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
return 0;
+ case Opt_pids_localevents:
+ ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ return 0;
}
return -EINVAL;
}
@@ -1989,6 +1997,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+ if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS;
}
}
@@ -2004,6 +2017,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
seq_puts(seq, ",memory_recursiveprot");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
seq_puts(seq, ",memory_hugetlb_accounting");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ seq_puts(seq, ",pids_localevents");
return 0;
}
@@ -6686,8 +6701,10 @@ void cgroup_exit(struct task_struct *tsk)
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
css_set_move_task(tsk, cset, NULL, false);
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+ /* matches the signal->live check in css_task_iter_advance() */
+ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
@@ -6714,10 +6731,12 @@ void cgroup_release(struct task_struct *task)
ss->release(task);
} while_each_subsys_mask();
- spin_lock_irq(&css_set_lock);
- css_set_skip_task_iters(task_css_set(task), task);
- list_del_init(&task->cg_list);
- spin_unlock_irq(&css_set_lock);
+ if (!list_empty(&task->cg_list)) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
}
void cgroup_free(struct task_struct *task)
@@ -7062,7 +7081,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
"favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n"
- "memory_hugetlb_accounting\n");
+ "memory_hugetlb_accounting\n"
+ "pids_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c12b9fdb22a4..40ec4abaf440 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,6 +21,7 @@
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
+#include "cgroup-internal.h"
#include <linux/cpu.h>
#include <linux/cpumask.h>
@@ -87,7 +88,7 @@ static const char * const perr_strings[] = {
[PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
[PERR_HOTPLUG] = "No cpu available due to hotplug",
- [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+ [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
};
@@ -127,19 +128,28 @@ struct cpuset {
/*
* Exclusive CPUs dedicated to current cgroup (default hierarchy only)
*
- * This exclusive CPUs must be a subset of cpus_allowed. A parent
- * cgroup can only grant exclusive CPUs to one of its children.
+ * The effective_cpus of a valid partition root comes solely from its
+ * effective_xcpus and some of the effective_xcpus may be distributed
+ * to sub-partitions below & hence excluded from its effective_cpus.
+ * For a valid partition root, its effective_cpus have no relationship
+ * with cpus_allowed unless its exclusive_cpus isn't set.
*
- * When the cgroup becomes a valid partition root, effective_xcpus
- * defaults to cpus_allowed if not set. The effective_cpus of a valid
- * partition root comes solely from its effective_xcpus and some of the
- * effective_xcpus may be distributed to sub-partitions below & hence
- * excluded from its effective_cpus.
+ * This value will only be set if either exclusive_cpus is set or
+ * when this cpuset becomes a local partition root.
*/
cpumask_var_t effective_xcpus;
/*
* Exclusive CPUs as requested by the user (default hierarchy only)
+ *
+ * Its value is independent of cpus_allowed and designates the set of
+ * CPUs that can be granted to the current cpuset or its children when
+ * it becomes a valid partition root. The effective set of exclusive
+ * CPUs granted (effective_xcpus) depends on whether those exclusive
+ * CPUs are passed down by its ancestors and not yet taken up by
+ * another sibling partition root along the way.
+ *
+ * If its value isn't set, it defaults to cpus_allowed.
*/
cpumask_var_t exclusive_cpus;
@@ -169,7 +179,7 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* number of valid sub-partitions */
+ /* number of valid local child partitions */
int nr_subparts;
/* partition root state */
@@ -230,6 +240,17 @@ static struct list_head remote_children;
* 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
* -2 - invalid isolated partition root
+ *
+ * There are 2 types of partitions - local or remote. Local partitions are
+ * those whose parents are partition root themselves. Setting of
+ * cpuset.cpus.exclusive are optional in setting up local partitions.
+ * Remote partitions are those whose parents are not partition roots. Passing
+ * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
+ * nodes are mandatory in creating a remote partition.
+ *
+ * For simplicity, a local partition can be created under a local or remote
+ * partition but a remote partition cannot have any partition root in its
+ * ancestor chain except the cgroup root.
*/
#define PRS_MEMBER 0
#define PRS_ROOT 1
@@ -434,7 +455,7 @@ static struct cpuset top_cpuset = {
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_lock across
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
@@ -709,6 +730,19 @@ static inline void free_cpuset(struct cpuset *cs)
kfree(cs);
}
+/* Return user specified exclusive CPUs */
+static inline struct cpumask *user_xcpus(struct cpuset *cs)
+{
+ return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
+ : cs->exclusive_cpus;
+}
+
+static inline bool xcpus_empty(struct cpuset *cs)
+{
+ return cpumask_empty(cs->cpus_allowed) &&
+ cpumask_empty(cs->exclusive_cpus);
+}
+
static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
{
return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
@@ -825,17 +859,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/*
* If either I or some sibling (!= me) is exclusive, we can't
- * overlap
+ * overlap. exclusive_cpus cannot overlap with each other if set.
*/
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur) {
+ bool txset, cxset; /* Are exclusive_cpus set? */
+
+ if (c == cur)
+ continue;
+
+ txset = !cpumask_empty(trial->exclusive_cpus);
+ cxset = !cpumask_empty(c->exclusive_cpus);
+ if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
+ (txset && cxset)) {
if (!cpusets_are_exclusive(trial, c))
goto out;
+ } else if (txset || cxset) {
+ struct cpumask *xcpus, *acpus;
+
+ /*
+ * When just one of the exclusive_cpus's is set,
+ * cpus_allowed of the other cpuset, if set, cannot be
+ * a subset of it or none of those CPUs will be
+ * available if these exclusive CPUs are activated.
+ */
+ if (txset) {
+ xcpus = trial->exclusive_cpus;
+ acpus = c->cpus_allowed;
+ } else {
+ xcpus = c->exclusive_cpus;
+ acpus = trial->cpus_allowed;
+ }
+ if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
+ goto out;
}
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
goto out;
}
@@ -957,13 +1015,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
+ bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && !top_cpuset.nr_subparts) {
+ if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -991,16 +1051,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
+
+ if (cgrpv2)
+ goto v2;
+
/*
+ * v1:
* Continue traversing beyond @cp iff @cp has some CPUs and
* isn't load balancing. The former is obvious. The
* latter: All child cpusets contain a subset of the
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
- *
- * If root is load-balancing, we can skip @cp if it
- * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -1008,20 +1070,39 @@ static int generate_sched_domains(cpumask_var_t **domains,
housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
- if (root_load_balance &&
- cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
- continue;
-
if (is_sched_load_balance(cp) &&
!cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
- /* skip @cp's subtree if not a partition root */
- if (!is_partition_valid(cp))
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+
+v2:
+ /*
+ * Only valid partition roots that are not isolated and with
+ * non-empty effective_cpus will be saved into csn[].
+ */
+ if ((cp->partition_root_state == PRS_ROOT) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /*
+ * Skip @cp's subtree if not a partition root and has no
+ * exclusive CPUs to be granted to child cpusets.
+ */
+ if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
+ /*
+ * If there are only isolated partitions underneath the cgroup root,
+ * we can optimize out unneeded sched domains scanning.
+ */
+ if (root_load_balance && (csn == 1))
+ goto single_root_domain;
+
for (i = 0; i < csn; i++)
csa[i]->pn = i;
ndoms = csn;
@@ -1064,6 +1145,20 @@ restart:
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
GFP_KERNEL);
+ /*
+ * Cgroup v2 doesn't support domain attributes, just set all of them
+ * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+ * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ */
+ if (cgrpv2) {
+ for (i = 0; i < ndoms; i++) {
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
+ }
+ goto done;
+ }
+
for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
struct cpumask *dp;
@@ -1223,7 +1318,7 @@ static void rebuild_sched_domains_locked(void)
* root should be only a subset of the active CPUs. Since a CPU in any
* partition root could be offlined, all must be checked.
*/
- if (top_cpuset.nr_subparts) {
+ if (!cpumask_empty(subpartitions_cpus)) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (!is_partition_valid(cs)) {
@@ -1338,7 +1433,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
*/
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
{
- bool exclusive = (new_prs > 0);
+ bool exclusive = (new_prs > PRS_MEMBER);
if (exclusive && !is_cpu_exclusive(cs)) {
if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
@@ -1532,7 +1627,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
* Return: true if xcpus is not empty, false otherwise.
*
* Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ * it must be a subset of parent's effective_xcpus.
*/
static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
struct cpumask *xcpus)
@@ -1542,12 +1637,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
if (!xcpus)
xcpus = cs->effective_xcpus;
- if (!cpumask_empty(cs->exclusive_cpus))
- cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
- else
- cpumask_copy(xcpus, cs->cpus_allowed);
-
- return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+ return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
}
static inline bool is_remote_partition(struct cpuset *cs)
@@ -1826,8 +1916,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*/
adding = deleting = false;
old_prs = new_prs = cs->partition_root_state;
- xcpus = !cpumask_empty(cs->exclusive_cpus)
- ? cs->effective_xcpus : cs->cpus_allowed;
+ xcpus = user_xcpus(cs);
if (cmd == partcmd_invalidate) {
if (is_prs_invalid(old_prs))
@@ -1855,7 +1944,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
return is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART;
}
- if (!newmask && cpumask_empty(cs->cpus_allowed))
+ if (!newmask && xcpus_empty(cs))
return PERR_CPUSEMPTY;
nocpu = tasks_nocpu_error(parent, cs, xcpus);
@@ -2583,8 +2672,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
retval = cpulist_parse(buf, trialcs->exclusive_cpus);
if (retval < 0)
return retval;
- if (!is_cpu_exclusive(cs))
- set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
}
/* Nothing to do if the CPUs didn't change */
@@ -3071,9 +3158,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
? partcmd_enable : partcmd_enablei;
/*
- * cpus_allowed cannot be empty.
+ * cpus_allowed and exclusive_cpus cannot be both empty.
*/
- if (cpumask_empty(cs->cpus_allowed)) {
+ if (xcpus_empty(cs)) {
err = PERR_CPUSEMPTY;
goto out;
}
@@ -4009,8 +4096,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
}
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- nodes_clear(cs->mems_allowed);
- nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
INIT_LIST_HEAD(&cs->remote_sibling);
@@ -4040,6 +4125,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset_inc();
@@ -4050,14 +4141,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
-
- /*
- * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
- */
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_sched_load_balance(parent))
- clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-
spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -4571,7 +4654,7 @@ static void cpuset_handle_hotplug(void)
* In the rare case that hotplug removes all the cpus in
* subpartitions_cpus, we assumed that cpus are updated.
*/
- if (!cpus_updated && top_cpuset.nr_subparts)
+ if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
cpus_updated = true;
/* For v1, synchronize cpus_allowed to cpu_active_mask */
@@ -5051,10 +5134,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- css = task_get_css(tsk, cpuset_cgrp_id);
- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
- css_put(css);
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 79a3717a5803..0e26068995a6 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -121,6 +121,30 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
misc_res_name[type]);
}
+static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage)
+{
+ u64 old;
+
+ while (true) {
+ old = atomic64_read(&res->watermark);
+ if (new_usage <= old)
+ break;
+ if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old)
+ break;
+ }
+}
+
+static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg)
+{
+ atomic64_inc(&cg->res[type].events_local);
+ cgroup_file_notify(&cg->events_local_file);
+
+ for (; parent_misc(cg); cg = parent_misc(cg)) {
+ atomic64_inc(&cg->res[type].events);
+ cgroup_file_notify(&cg->events_file);
+ }
+}
+
/**
* misc_cg_try_charge() - Try charging the misc cgroup.
* @type: Misc res type to charge.
@@ -159,14 +183,12 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
ret = -EBUSY;
goto err_charge;
}
+ misc_cg_update_watermark(res, new_usage);
}
return 0;
err_charge:
- for (j = i; j; j = parent_misc(j)) {
- atomic64_inc(&j->res[type].events);
- cgroup_file_notify(&j->events_file);
- }
+ misc_cg_event(type, i);
for (j = cg; j != i; j = parent_misc(j))
misc_cg_cancel_charge(type, j, amount);
@@ -308,6 +330,29 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
}
/**
+ * misc_cg_peak_show() - Show the peak usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_peak_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 watermark;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ watermark = atomic64_read(&cg->res[i].watermark);
+ if (READ_ONCE(misc_res_capacity[i]) || watermark)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark);
+ }
+
+ return 0;
+}
+
+/**
* misc_cg_capacity_show() - Show the total capacity of misc res on the host.
* @sf: Interface file
* @v: Arguments passed
@@ -331,20 +376,33 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
return 0;
}
-static int misc_events_show(struct seq_file *sf, void *v)
+static int __misc_events_show(struct seq_file *sf, bool local)
{
struct misc_cg *cg = css_misc(seq_css(sf));
u64 events;
int i;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- events = atomic64_read(&cg->res[i].events);
+ if (local)
+ events = atomic64_read(&cg->res[i].events_local);
+ else
+ events = atomic64_read(&cg->res[i].events);
if (READ_ONCE(misc_res_capacity[i]) || events)
seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
}
return 0;
}
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, false);
+}
+
+static int misc_events_local_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, true);
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -358,6 +416,10 @@ static struct cftype misc_cg_files[] = {
.seq_show = misc_cg_current_show,
},
{
+ .name = "peak",
+ .seq_show = misc_cg_peak_show,
+ },
+ {
.name = "capacity",
.seq_show = misc_cg_capacity_show,
.flags = CFTYPE_ONLY_ON_ROOT,
@@ -368,6 +430,12 @@ static struct cftype misc_cg_files[] = {
.file_offset = offsetof(struct misc_cg, events_file),
.seq_show = misc_events_show,
},
+ {
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_local_file),
+ .seq_show = misc_events_local_show,
+ },
{}
};
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 0e5ec7d59b4d..f5cb0ec45b9d 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -38,6 +38,14 @@
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
+enum pidcg_event {
+ /* Fork failed in subtree because this pids_cgroup limit was hit. */
+ PIDCG_MAX,
+ /* Fork failed in this pids_cgroup because ancestor limit was hit. */
+ PIDCG_FORKFAIL,
+ NR_PIDCG_EVENTS,
+};
+
struct pids_cgroup {
struct cgroup_subsys_state css;
@@ -49,11 +57,12 @@ struct pids_cgroup {
atomic64_t limit;
int64_t watermark;
- /* Handle for "pids.events" */
+ /* Handles for pids.events[.local] */
struct cgroup_file events_file;
+ struct cgroup_file events_local_file;
- /* Number of times fork failed because limit was hit. */
- atomic64_t events_limit;
+ atomic64_t events[NR_PIDCG_EVENTS];
+ atomic64_t events_local[NR_PIDCG_EVENTS];
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -148,12 +157,13 @@ static void pids_charge(struct pids_cgroup *pids, int num)
* pids_try_charge - hierarchically try to charge the pid count
* @pids: the pid cgroup state
* @num: the number of pids to charge
+ * @fail: storage of pid cgroup causing the fail
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
* succeeded, otherwise -EAGAIN.
*/
-static int pids_try_charge(struct pids_cgroup *pids, int num)
+static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
{
struct pids_cgroup *p, *q;
@@ -166,9 +176,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > limit)
+ if (new > limit) {
+ *fail = p;
goto revert;
-
+ }
/*
* Not technically accurate if we go over limit somewhere up
* the hierarchy, but that's tolerable for the watermark.
@@ -229,6 +240,36 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
}
}
+static void pids_event(struct pids_cgroup *pids_forking,
+ struct pids_cgroup *pids_over_limit)
+{
+ struct pids_cgroup *p = pids_forking;
+ bool limit = false;
+
+ /* Only log the first time limit is hit. */
+ if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
+ pr_info("cgroup: fork rejected by pids controller in ");
+ pr_cont_cgroup_path(p->css.cgroup);
+ pr_cont("\n");
+ }
+ cgroup_file_notify(&p->events_local_file);
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ return;
+
+ for (; parent_pids(p); p = parent_pids(p)) {
+ if (p == pids_over_limit) {
+ limit = true;
+ atomic64_inc(&p->events_local[PIDCG_MAX]);
+ cgroup_file_notify(&p->events_local_file);
+ }
+ if (limit)
+ atomic64_inc(&p->events[PIDCG_MAX]);
+
+ cgroup_file_notify(&p->events_file);
+ }
+}
+
/*
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on cgroup_threadgroup_change_begin() held by the copy_process().
@@ -236,7 +277,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
- struct pids_cgroup *pids;
+ struct pids_cgroup *pids, *pids_over_limit;
int err;
if (cset)
@@ -244,16 +285,10 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset)
else
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
- err = pids_try_charge(pids, 1);
- if (err) {
- /* Only log the first time events_limit is incremented. */
- if (atomic64_inc_return(&pids->events_limit) == 1) {
- pr_info("cgroup: fork rejected by pids controller in ");
- pr_cont_cgroup_path(css->cgroup);
- pr_cont("\n");
- }
- cgroup_file_notify(&pids->events_file);
- }
+ err = pids_try_charge(pids, 1, &pids_over_limit);
+ if (err)
+ pids_event(pids, pids_over_limit);
+
return err;
}
@@ -337,11 +372,32 @@ static s64 pids_peak_read(struct cgroup_subsys_state *css,
return READ_ONCE(pids->watermark);
}
-static int pids_events_show(struct seq_file *sf, void *v)
+static int __pids_events_show(struct seq_file *sf, bool local)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
+ enum pidcg_event pe = PIDCG_MAX;
+ atomic64_t *events;
+
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+ pe = PIDCG_FORKFAIL;
+ local = true;
+ }
+ events = local ? pids->events_local : pids->events;
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
+ return 0;
+}
- seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, false);
+ return 0;
+}
+
+static int pids_events_local_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, true);
return 0;
}
@@ -368,9 +424,42 @@ static struct cftype pids_files[] = {
.file_offset = offsetof(struct pids_cgroup, events_file),
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "events.local",
+ .seq_show = pids_events_local_show,
+ .file_offset = offsetof(struct pids_cgroup, events_local_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+static struct cftype pids_files_legacy[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
+
struct cgroup_subsys pids_cgrp_subsys = {
.css_alloc = pids_css_alloc,
.css_free = pids_css_free,
@@ -379,7 +468,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
.release = pids_release,
- .legacy_cftypes = pids_files,
+ .legacy_cftypes = pids_files_legacy,
.dfl_cftypes = pids_files,
.threaded = true,
};
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index fb8b49437573..a06b45272411 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -594,49 +594,46 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
}
}
+
+static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time = bstat->forceidle_sum;
+
+ do_div(forceidle_time, NSEC_PER_USEC);
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct cgroup_base_stat bstat;
-#ifdef CONFIG_SCHED_CORE
- u64 forceidle_time;
-#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
-#ifdef CONFIG_SCHED_CORE
- forceidle_time = cgrp->bstat.forceidle_sum;
-#endif
cgroup_rstat_flush_release(cgrp);
} else {
- root_cgroup_cputime(&bstat);
- usage = bstat.cputime.sum_exec_runtime;
- utime = bstat.cputime.utime;
- stime = bstat.cputime.stime;
-#ifdef CONFIG_SCHED_CORE
- forceidle_time = bstat.forceidle_sum;
-#endif
+ /* cgrp->bstat of root is not actually used, reuse it */
+ root_cgroup_cputime(&cgrp->bstat);
+ usage = cgrp->bstat.cputime.sum_exec_runtime;
+ utime = cgrp->bstat.cputime.utime;
+ stime = cgrp->bstat.cputime.stime;
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
-#ifdef CONFIG_SCHED_CORE
- do_div(forceidle_time, NSEC_PER_USEC);
-#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
-#ifdef CONFIG_SCHED_CORE
- seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
-#endif
+ cgroup_force_idle_show(seq, &cgrp->bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore
index 2732e0b29271..952e4448bf07 100644
--- a/tools/testing/selftests/cgroup/.gitignore
+++ b/tools/testing/selftests/cgroup/.gitignore
@@ -1,11 +1,12 @@
# SPDX-License-Identifier: GPL-2.0-only
-test_memcontrol
test_core
-test_freezer
-test_kmem
-test_kill
test_cpu
test_cpuset
-test_zswap
+test_freezer
test_hugetlb_memcg
+test_kill
+test_kmem
+test_memcontrol
+test_pids
+test_zswap
wait_inotify
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
index 16461dc0ffdf..1b897152bab6 100644
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -6,26 +6,29 @@ all: ${HELPER_PROGS}
TEST_FILES := with_stress.sh
TEST_PROGS := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh
TEST_GEN_FILES := wait_inotify
-TEST_GEN_PROGS = test_memcontrol
-TEST_GEN_PROGS += test_kmem
-TEST_GEN_PROGS += test_core
-TEST_GEN_PROGS += test_freezer
-TEST_GEN_PROGS += test_kill
+# Keep the lists lexicographically sorted
+TEST_GEN_PROGS = test_core
TEST_GEN_PROGS += test_cpu
TEST_GEN_PROGS += test_cpuset
-TEST_GEN_PROGS += test_zswap
+TEST_GEN_PROGS += test_freezer
TEST_GEN_PROGS += test_hugetlb_memcg
+TEST_GEN_PROGS += test_kill
+TEST_GEN_PROGS += test_kmem
+TEST_GEN_PROGS += test_memcontrol
+TEST_GEN_PROGS += test_pids
+TEST_GEN_PROGS += test_zswap
LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h
include ../lib.mk
-$(OUTPUT)/test_memcontrol: cgroup_util.c
-$(OUTPUT)/test_kmem: cgroup_util.c
$(OUTPUT)/test_core: cgroup_util.c
-$(OUTPUT)/test_freezer: cgroup_util.c
-$(OUTPUT)/test_kill: cgroup_util.c
$(OUTPUT)/test_cpu: cgroup_util.c
$(OUTPUT)/test_cpuset: cgroup_util.c
-$(OUTPUT)/test_zswap: cgroup_util.c
+$(OUTPUT)/test_freezer: cgroup_util.c
$(OUTPUT)/test_hugetlb_memcg: cgroup_util.c
+$(OUTPUT)/test_kill: cgroup_util.c
+$(OUTPUT)/test_kmem: cgroup_util.c
+$(OUTPUT)/test_memcontrol: cgroup_util.c
+$(OUTPUT)/test_pids: cgroup_util.c
+$(OUTPUT)/test_zswap: cgroup_util.c
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index b5eb1be2248c..7c08cc153367 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -28,6 +28,14 @@ CPULIST=$(cat $CGROUP2/cpuset.cpus.effective)
NR_CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//")
[[ $NR_CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!"
+# Check to see if /dev/console exists and is writable
+if [[ -c /dev/console && -w /dev/console ]]
+then
+ CONSOLE=/dev/console
+else
+ CONSOLE=/dev/null
+fi
+
# Set verbose flag and delay factor
PROG=$1
VERBOSE=0
@@ -103,8 +111,8 @@ console_msg()
{
MSG=$1
echo "$MSG"
- echo "" > /dev/console
- echo "$MSG" > /dev/console
+ echo "" > $CONSOLE
+ echo "$MSG" > $CONSOLE
pause 0.01
}
@@ -161,6 +169,14 @@ test_add_proc()
# T = put a task into cgroup
# O<c>=<v> = Write <v> to CPU online file of <c>
#
+# ECPUs - effective CPUs of cpusets
+# Pstate - partition root state
+# ISOLCPUS - isolated CPUs (<icpus>[,<icpus2>])
+#
+# Note that if there are 2 fields in ISOLCPUS, the first one is for
+# sched-debug matching which includes offline CPUs and single-CPU partitions
+# while the second one is for matching cpuset.cpus.isolated.
+#
SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1"
TEST_MATRIX=(
# old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
@@ -220,23 +236,29 @@ TEST_MATRIX=(
" C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3:P2 . . 0 A1:0-1,A2:2-3,A3:2-3 A1:P0,A2:P2 2-3"
" C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2,A2:3,A3:3 A1:P0,A2:P2 3"
" C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3"
- " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-2,A2:1-2,A3:3 A1:P0,A3:P2 3"
+ " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3"
" C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3,A2:1-3,A3:2-3,B1:2-3 A1:P0,A3:P0,B1:P-2"
" C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5"
" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3,B1:4 A3:P2,B1:P2 2-4"
" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3,B1:4 A3:P2,B1:P2 2-4"
" C0-3:S+ C1-3:S+ C2-3 C4 X1-3 X1-3:P2 P2 . 0 A2:1,A3:2-3 A2:P2,A3:P2 1-3"
" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2:C4-5 0 A3:2-3,B1:4-5 A3:P2,B1:P2 2-5"
+ " C4:X0-3:S+ X1-3:S+ X2-3 . . P2 . . 0 A1:4,A2:1-3,A3:1-3 A2:P2 1-3"
+ " C4:X0-3:S+ X1-3:S+ X2-3 . . . P2 . 0 A1:4,A2:4,A3:2-3 A3:P2 2-3"
# Nested remote/local partition tests
" C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4-5 \
A1:P0,A2:P1,A3:P2,B1:P1 2-3"
" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4 \
A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3"
+ " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 . P1 0 A1:0-1,A2:2-3,A3:2-3,B1:4 \
+ A1:P0,A2:P1,A3:P0,B1:P1"
" C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:2,A3:3,B1:4 \
A1:P0,A2:P1,A3:P2,B1:P1 2-4,3"
" C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1,A2:2-3,A3:4 \
A1:P0,A2:P2,A3:P1 2-4,2-3"
+ " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X3-4:P1 . 0 A1:0-1,A2:2,A3:3-4 \
+ A1:P0,A2:P2,A3:P1 2"
" C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \
. . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \
A1:P0,A2:P-2,A3:P-1"
@@ -262,8 +284,8 @@ TEST_MATRIX=(
. . X2-3 P2 . . 0 A1:0-2,A2:3,XA2:3 A2:P2 3"
# Invalid to valid local partition direct transition tests
- " C1-3:S+:P2 C2-3:X1:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3"
- " C1-3:S+:P2 C2-3:X1:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3"
+ " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:1-3:XA2: A1:P2,A2:P-2 1-3"
+ " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3"
" C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4,B1:4-6 A1:P-2,B1:P0"
" C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3,B1:4-6 A1:P2,B1:P0 0-3"
" C0-3:P2 . . C3-5:C4-5 . . . . 0 A1:0-3,B1:4-5 A1:P2,B1:P0 0-3"
@@ -274,32 +296,26 @@ TEST_MATRIX=(
" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
. . X4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
- . . C4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
+ . . C4:X . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
# Local partition CPU change tests
" C0-5:S+:P2 C4-5:S+:P1 . . . C3-5 . . 0 A1:0-2,A2:3-5 A1:P2,A2:P1 0-2"
" C0-5:S+:P2 C4-5:S+:P1 . . C1-5 . . . 0 A1:1-3,A2:4-5 A1:P2,A2:P1 1-3"
# cpus_allowed/exclusive_cpus update tests
" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
- . C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \
+ . X:C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \
A1:P0,A3:P-2"
" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
. X1 . P2 . 0 A1:0-3,A2:1-3,XA1:1,XA2:,XA3:,A3:2-3 \
A1:P0,A3:P-2"
" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
- . . C3 P2 . 0 A1:0-2,A2:0-2,XA2:3,XA3:3,A3:3 \
- A1:P0,A3:P2 3"
- " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
. . X3 P2 . 0 A1:0-2,A2:1-2,XA2:3,XA3:3,A3:3 \
A1:P0,A3:P2 3"
" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
. . X3 . . 0 A1:0-3,A2:1-3,XA2:3,XA3:3,A3:2-3 \
A1:P0,A3:P-2"
" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
- . . C3 . . 0 A1:0-3,A2:3,XA2:3,XA3:3,A3:3 \
- A1:P0,A3:P-2"
- " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
- . C4 . . . 0 A1:4,A2:4,A3:4,XA1:,XA2:,XA3 \
+ . X4 . . . 0 A1:0-3,A2:1-3,A3:2-3,XA1:4,XA2:,XA3 \
A1:P0,A3:P-2"
# old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
@@ -346,6 +362,9 @@ TEST_MATRIX=(
" C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P-1"
" C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P0,B1:P-1"
+ # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it
+ " C0-3 . . C4-5 X5 . . . 0 A1:0-3,B1:4-5"
+
# old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
# ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
# Failure cases:
@@ -355,6 +374,9 @@ TEST_MATRIX=(
# Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected
" C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3,B1:4-5"
+
+ # cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive
+ " C0-3 . . C4-5 X3-5 . . . 1 A1:0-3,B1:4-5"
)
#
@@ -556,14 +578,15 @@ check_cgroup_states()
do
set -- $(echo $CHK | sed -e "s/:/ /g")
CGRP=$1
+ CGRP_DIR=$CGRP
STATE=$2
FILE=
EVAL=$(expr substr $STATE 2 2)
- [[ $CGRP = A2 ]] && CGRP=A1/A2
- [[ $CGRP = A3 ]] && CGRP=A1/A2/A3
+ [[ $CGRP = A2 ]] && CGRP_DIR=A1/A2
+ [[ $CGRP = A3 ]] && CGRP_DIR=A1/A2/A3
case $STATE in
- P*) FILE=$CGRP/cpuset.cpus.partition
+ P*) FILE=$CGRP_DIR/cpuset.cpus.partition
;;
*) echo "Unknown state: $STATE!"
exit 1
@@ -587,6 +610,16 @@ check_cgroup_states()
;;
esac
[[ $EVAL != $VAL ]] && return 1
+
+ #
+ # For root partition, dump sched-domains info to console if
+ # verbose mode set for manual comparison with sched debug info.
+ #
+ [[ $VAL -eq 1 && $VERBOSE -gt 0 ]] && {
+ DOMS=$(cat $CGRP_DIR/cpuset.cpus.effective)
+ [[ -n "$DOMS" ]] &&
+ echo " [$CGRP] sched-domain: $DOMS" > $CONSOLE
+ }
done
return 0
}
@@ -694,9 +727,9 @@ null_isolcpus_check()
[[ $VERBOSE -gt 0 ]] || return 0
# Retry a few times before printing error
RETRY=0
- while [[ $RETRY -lt 5 ]]
+ while [[ $RETRY -lt 8 ]]
do
- pause 0.01
+ pause 0.02
check_isolcpus "."
[[ $? -eq 0 ]] && return 0
((RETRY++))
@@ -726,7 +759,7 @@ run_state_test()
while [[ $I -lt $CNT ]]
do
- echo "Running test $I ..." > /dev/console
+ echo "Running test $I ..." > $CONSOLE
[[ $VERBOSE -gt 1 ]] && {
echo ""
eval echo \${$TEST[$I]}
@@ -783,7 +816,7 @@ run_state_test()
while [[ $NEWLIST != $CPULIST && $RETRY -lt 8 ]]
do
# Wait a bit longer & recheck a few times
- pause 0.01
+ pause 0.02
((RETRY++))
NEWLIST=$(cat cpuset.cpus.effective)
done
diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c
new file mode 100644
index 000000000000..9ecb83c6cc5c
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_pids.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <linux/limits.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+static int run_success(const char *cgroup, void *arg)
+{
+ return 0;
+}
+
+static int run_pause(const char *cgroup, void *arg)
+{
+ return pause();
+}
+
+/*
+ * This test checks that pids.max prevents forking new children above the
+ * specified limit in the cgroup.
+ */
+static int test_pids_max(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg_pids;
+ int pid;
+
+ cg_pids = cg_name(root, "pids_test");
+ if (!cg_pids)
+ goto cleanup;
+
+ if (cg_create(cg_pids))
+ goto cleanup;
+
+ if (cg_read_strcmp(cg_pids, "pids.max", "max\n"))
+ goto cleanup;
+
+ if (cg_write(cg_pids, "pids.max", "2"))
+ goto cleanup;
+
+ if (cg_enter_current(cg_pids))
+ goto cleanup;
+
+ pid = cg_run_nowait(cg_pids, run_pause, NULL);
+ if (pid < 0)
+ goto cleanup;
+
+ if (cg_run_nowait(cg_pids, run_success, NULL) != -1 || errno != EAGAIN)
+ goto cleanup;
+
+ if (kill(pid, SIGINT))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_enter_current(root);
+ cg_destroy(cg_pids);
+ free(cg_pids);
+
+ return ret;
+}
+
+/*
+ * This test checks that pids.events are counted in cgroup associated with pids.max
+ */
+static int test_pids_events(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg_parent = NULL, *cg_child = NULL;
+ int pid;
+
+ cg_parent = cg_name(root, "pids_parent");
+ cg_child = cg_name(cg_parent, "pids_child");
+ if (!cg_parent || !cg_child)
+ goto cleanup;
+
+ if (cg_create(cg_parent))
+ goto cleanup;
+ if (cg_write(cg_parent, "cgroup.subtree_control", "+pids"))
+ goto cleanup;
+ if (cg_create(cg_child))
+ goto cleanup;
+
+ if (cg_write(cg_parent, "pids.max", "2"))
+ goto cleanup;
+
+ if (cg_read_strcmp(cg_child, "pids.max", "max\n"))
+ goto cleanup;
+
+ if (cg_enter_current(cg_child))
+ goto cleanup;
+
+ pid = cg_run_nowait(cg_child, run_pause, NULL);
+ if (pid < 0)
+ goto cleanup;
+
+ if (cg_run_nowait(cg_child, run_success, NULL) != -1 || errno != EAGAIN)
+ goto cleanup;
+
+ if (kill(pid, SIGINT))
+ goto cleanup;
+
+ if (cg_read_key_long(cg_child, "pids.events", "max ") != 0)
+ goto cleanup;
+ if (cg_read_key_long(cg_parent, "pids.events", "max ") != 1)
+ goto cleanup;
+
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_enter_current(root);
+ if (cg_child)
+ cg_destroy(cg_child);
+ if (cg_parent)
+ cg_destroy(cg_parent);
+ free(cg_child);
+ free(cg_parent);
+
+ return ret;
+}
+
+
+
+#define T(x) { x, #x }
+struct pids_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_pids_max),
+ T(test_pids_events),
+};
+#undef T
+
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ /*
+ * Check that pids controller is available:
+ * pids is listed in cgroup.controllers
+ */
+ if (cg_read_strstr(root, "cgroup.controllers", "pids"))
+ ksft_exit_skip("pids controller isn't available\n");
+
+ if (cg_read_strstr(root, "cgroup.subtree_control", "pids"))
+ if (cg_write(root, "cgroup.subtree_control", "+pids"))
+ ksft_exit_skip("Failed to set pids controller\n");
+
+ for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ ksft_finished();
+}