From cfe7ddcbd72dc67ce5749cc6f451a2b0c6aec5b5 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:29:47 +0100
Subject: ARM, sched/topology: Remove SD_SHARE_POWERDOMAIN

This flag was introduced in 2014 by commit:

  d77b3ed5c9f8 ("sched: Add a new SD_SHARE_POWERDOMAIN for sched_domain")

but as far as I'm aware it was never leveraged by the scheduler. The
closest thing I can think of is EAS caring about frequency domains, and
it does that by leveraging performance domains.

Remove the flag. No change in functionality is expected.

Suggested-by: Morten Rasmussen
Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Reviewed-by: Dietmar Eggemann
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-2-valentin.schneider@arm.com
---
 include/linux/sched/topology.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 820511289857..6ec7d7c1d1e3 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -18,13 +18,12 @@
 #define SD_WAKE_AFFINE		0x0010	/* Wake task to waking CPU */
 #define SD_ASYM_CPUCAPACITY	0x0020	/* Domain members have different CPU capacities */
 #define SD_SHARE_CPUCAPACITY	0x0040	/* Domain members share CPU capacity */
-#define SD_SHARE_POWERDOMAIN	0x0080	/* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES	0x0100	/* Domain members share CPU pkg resources */
-#define SD_SERIALIZE		0x0200	/* Only a single load balancing instance */
-#define SD_ASYM_PACKING		0x0400	/* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING	0x0800	/* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP		0x1000	/* sched_domains of this level overlap */
-#define SD_NUMA			0x2000	/* cross-node balancing */
+#define SD_SHARE_PKG_RESOURCES	0x0080	/* Domain members share CPU pkg resources */
+#define SD_SERIALIZE		0x0100	/* Only a single load balancing instance */
+#define SD_ASYM_PACKING		0x0200	/* Place busy groups earlier in the domain */
+#define SD_PREFER_SIBLING	0x0400	/* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP		0x0800	/* sched_domains of this level overlap */
+#define SD_NUMA			0x1000	/* cross-node balancing */
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)

From d54a9658a75633b839af7a2c6c758807678b8064 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:29:49 +0100
Subject: sched/topology: Split out SD_* flags declaration to its own file

To associate the SD flags with some metadata, we need some more
structure in the way they are declared. Rather than shove that in a
free-standing macro list, move the declarations into a separate file
that can be re-imported with different SD_FLAG definitions. This is
inspired by what is done with the syscall table (see uapi/asm/unistd.h
and sys_call_table).

The value assigned to a given SD flag now depends on the order in which
it appears in sd_flags.h. No change in functionality.
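For readers unfamiliar with the include-twice idiom adopted here, the following is a minimal standalone sketch of the same pattern; the file and flag names are invented for illustration and this is not kernel code:

/* demo_flags.h - deliberately has no include guard: it is meant to be
 * included repeatedly, each time with a different DEMO_FLAG() definition. */
#ifndef DEMO_FLAG
# error "Incorrect import of flag definitions"
#endif

DEMO_FLAG(FLAG_FOO)
DEMO_FLAG(FLAG_BAR)

/* consumer.c */
/* First import: generate sequential indexes (__FLAG_FOO = 0, __FLAG_BAR = 1). */
#define DEMO_FLAG(name) __##name,
enum {
	#include "demo_flags.h"
	__FLAG_CNT,
};
#undef DEMO_FLAG

/* Second import: generate the bit values (FLAG_FOO = 1 << 0, FLAG_BAR = 1 << 1). */
#define DEMO_FLAG(name) name = 1 << __##name,
enum {
	#include "demo_flags.h"
};
#undef DEMO_FLAG

Because each value is derived from the flag's position in the list, reordering the list reorders the bit values, which is why the changelog stresses that a flag's value now depends on where it appears in sd_flags.h.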
Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Reviewed-by: Dietmar Eggemann
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-4-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 35 +++++++++++++++++++++++++++++++++++
 include/linux/sched/topology.h | 26 +++++++++++++-------------
 2 files changed, 48 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/sched/sd_flags.h

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
new file mode 100644
index 000000000000..373dc45c024e
--- /dev/null
+++ b/include/linux/sched/sd_flags.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sched-domains (multiprocessor balancing) flag declarations.
+ */
+
+#ifndef SD_FLAG
+# error "Incorrect import of SD flags definitions"
+#endif
+
+/* Balance when about to become idle */
+SD_FLAG(SD_BALANCE_NEWIDLE)
+/* Balance on exec */
+SD_FLAG(SD_BALANCE_EXEC)
+/* Balance on fork, clone */
+SD_FLAG(SD_BALANCE_FORK)
+/* Balance on wakeup */
+SD_FLAG(SD_BALANCE_WAKE)
+/* Wake task to waking CPU */
+SD_FLAG(SD_WAKE_AFFINE)
+/* Domain members have different CPU capacities */
+SD_FLAG(SD_ASYM_CPUCAPACITY)
+/* Domain members share CPU capacity */
+SD_FLAG(SD_SHARE_CPUCAPACITY)
+/* Domain members share CPU pkg resources */
+SD_FLAG(SD_SHARE_PKG_RESOURCES)
+/* Only a single load balancing instance */
+SD_FLAG(SD_SERIALIZE)
+/* Place busy groups earlier in the domain */
+SD_FLAG(SD_ASYM_PACKING)
+/* Prefer to place tasks in a sibling domain */
+SD_FLAG(SD_PREFER_SIBLING)
+/* sched_domains of this level overlap */
+SD_FLAG(SD_OVERLAP)
+/* cross-node balancing */
+SD_FLAG(SD_NUMA)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6ec7d7c1d1e3..3e41c0401b5f 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -11,19 +11,19 @@
  */
 #ifdef CONFIG_SMP
 
-#define SD_BALANCE_NEWIDLE	0x0001	/* Balance when about to become idle */
-#define SD_BALANCE_EXEC	0x0002	/* Balance on exec */
-#define SD_BALANCE_FORK	0x0004	/* Balance on fork, clone */
-#define SD_BALANCE_WAKE	0x0008	/* Balance on wakeup */
-#define SD_WAKE_AFFINE		0x0010	/* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY	0x0020	/* Domain members have different CPU capacities */
-#define SD_SHARE_CPUCAPACITY	0x0040	/* Domain members share CPU capacity */
-#define SD_SHARE_PKG_RESOURCES	0x0080	/* Domain members share CPU pkg resources */
-#define SD_SERIALIZE		0x0100	/* Only a single load balancing instance */
-#define SD_ASYM_PACKING		0x0200	/* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING	0x0400	/* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP		0x0800	/* sched_domains of this level overlap */
-#define SD_NUMA			0x1000	/* cross-node balancing */
+/* Generate SD flag indexes */
+#define SD_FLAG(name) __##name,
+enum {
+	#include <linux/sched/sd_flags.h>
+	__SD_FLAG_CNT,
+};
+#undef SD_FLAG
+/* Generate SD flag bits */
+#define SD_FLAG(name) name = 1 << __##name,
+enum {
+	#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)

From b6e862f386722e0de6c37f85f1cf438a0efa7f93 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:29:50 +0100
Subject: sched/topology: Define and assign sched_domain flag metadata

There are some expectations regarding how sched domain flags should be
laid out, but none of them are checked or asserted in
sched_domain_debug_one().
After staring at said flags for a while, I've come to realize there are
two repeating patterns:

- Shared with children: those flags are set from the base CPU domain
  upwards. Any domain that has it set will have it set in its children.
  It hints at "some property holds true / some behaviour is enabled
  until this level".

- Shared with parents: those flags are set from the topmost domain
  downwards. Any domain that has it set will have it set in its parents.
  It hints at "some property isn't visible / some behaviour is disabled
  until this level".

There are two outliers that (currently) do not map to either of these:

o SD_PREFER_SIBLING, which is cleared below levels with
  SD_ASYM_CPUCAPACITY. The change was introduced by commit:

    9c63e84db29b ("sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains")

  as it could break misfit migration on some systems. In light of this,
  we might want to change it back to make it fit one of the two
  categories and fix the issue another way.

o SD_ASYM_CPUCAPACITY, which gets set on a single level and isn't
  propagated up nor down. From a topology description point of view, it
  really wants to be SDF_SHARED_PARENT; this will be rectified in a
  later patch.

Tweak the sched_domain flag declaration to assign each flag an expected
layout, and include the rationale for each flag "meta type" assignment
as a comment.

Consolidate the flag metadata into an array; the index of a flag's
metadata can easily be found with log2(flag), IOW __ffs(flag).

Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Reviewed-by: Dietmar Eggemann
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-5-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 147 +++++++++++++++++++++++++++++++++--------
 include/linux/sched/topology.h |  15 ++++-
 2 files changed, 134 insertions(+), 28 deletions(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 373dc45c024e..c24a45b05fbb 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -7,29 +7,124 @@
 # error "Incorrect import of SD flags definitions"
 #endif
 
-/* Balance when about to become idle */
-SD_FLAG(SD_BALANCE_NEWIDLE)
-/* Balance on exec */
-SD_FLAG(SD_BALANCE_EXEC)
-/* Balance on fork, clone */
-SD_FLAG(SD_BALANCE_FORK)
-/* Balance on wakeup */
-SD_FLAG(SD_BALANCE_WAKE)
-/* Wake task to waking CPU */
-SD_FLAG(SD_WAKE_AFFINE)
-/* Domain members have different CPU capacities */
-SD_FLAG(SD_ASYM_CPUCAPACITY)
-/* Domain members share CPU capacity */
-SD_FLAG(SD_SHARE_CPUCAPACITY)
-/* Domain members share CPU pkg resources */
-SD_FLAG(SD_SHARE_PKG_RESOURCES)
-/* Only a single load balancing instance */
-SD_FLAG(SD_SERIALIZE)
-/* Place busy groups earlier in the domain */
-SD_FLAG(SD_ASYM_PACKING)
-/* Prefer to place tasks in a sibling domain */
-SD_FLAG(SD_PREFER_SIBLING)
-/* sched_domains of this level overlap */
-SD_FLAG(SD_OVERLAP)
-/* cross-node balancing */
-SD_FLAG(SD_NUMA)
+/*
+ * Expected flag uses
+ *
+ * SHARED_CHILD: These flags are meant to be set from the base domain upwards.
+ * If a domain has this flag set, all of its children should have it set. This
+ * is usually because the flag describes some shared resource (all CPUs in that
+ * domain share the same resource), or because they are tied to a scheduling
+ * behaviour that we want to disable at some point in the hierarchy for
+ * scalability reasons.
+ *
+ * In those cases it doesn't make sense to have the flag set for a domain but
+ * not have it in (some of) its children: sched domains ALWAYS span their child
+ * domains, so operations done with parent domains will cover CPUs in the lower
+ * child domains.
+ *
+ *
+ * SHARED_PARENT: These flags are meant to be set from the highest domain
+ * downwards. If a domain has this flag set, all of its parents should have it
+ * set. This is usually for topology properties that start to appear above a
+ * certain level (e.g. domain starts spanning CPUs outside of the base CPU's
+ * socket).
+ */
+#define SDF_SHARED_CHILD	0x1
+#define SDF_SHARED_PARENT	0x2
+
+/*
+ * Balance when about to become idle
+ *
+ * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
+ */
+SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD)
+
+/*
+ * Balance on exec
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ */
+SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD)
+
+/*
+ * Balance on fork, clone
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ */
+SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD)
+
+/*
+ * Balance on wakeup
+ *
+ * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
+ */
+SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD)
+
+/*
+ * Consider waking task on waking CPU.
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ */
+SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
+
+/*
+ * Domain members have different CPU capacities
+ */
+SD_FLAG(SD_ASYM_CPUCAPACITY, 0)
+
+/*
+ * Domain members share CPU capacity (i.e. SMT)
+ *
+ * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
+ * CPU capacity.
+ */
+SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD)
+
+/*
+ * Domain members share CPU package resources (i.e. caches)
+ *
+ * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
+ * the same cache(s).
+ */
+SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD)
+
+/*
+ * Only a single load balancing instance
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
+ * different level upwards, but it doesn't change that if a domain has this flag
+ * set, then all of its parents need to have it too (otherwise the serialization
+ * doesn't make sense).
+ */
+SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT)
+
+/*
+ * Place busy tasks earlier in the domain
+ *
+ * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
+ * up, but currently assumed to be set from the base domain upwards (see
+ * update_top_cache_domain()).
+ */
+SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD)
+
+/*
+ * Prefer to place tasks in a sibling domain
+ *
+ * Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
+ * flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
+ */
+SD_FLAG(SD_PREFER_SIBLING, 0)
+
+/*
+ * sched_groups of this level overlap
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE.
+ */
+SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT)
+
+/*
+ * Cross-node balancing
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE.
+ */
+SD_FLAG(SD_NUMA, SDF_SHARED_PARENT)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 3e41c0401b5f..32f602ff37a0 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -12,19 +12,30 @@
 #ifdef CONFIG_SMP
 
 /* Generate SD flag indexes */
-#define SD_FLAG(name) __##name,
+#define SD_FLAG(name, mflags) __##name,
 enum {
 	#include <linux/sched/sd_flags.h>
 	__SD_FLAG_CNT,
 };
 #undef SD_FLAG
 /* Generate SD flag bits */
-#define SD_FLAG(name) name = 1 << __##name,
+#define SD_FLAG(name, mflags) name = 1 << __##name,
 enum {
 	#include <linux/sched/sd_flags.h>
 };
 #undef SD_FLAG
 
+#ifdef CONFIG_SCHED_DEBUG
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+static const struct {
+	unsigned int meta_flags;
+	char *name;
+} sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
 {

From 4ee4ea443a5dc3fc4d8ae338199676eae9d8ef02 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:29:53 +0100
Subject: sched/topology: Introduce SD metaflag for flags needing > 1 groups

In preparation for cleaning up the sd_degenerate*() functions, mark
flags used in sd_degenerate() with the new SDF_NEEDS_GROUPS flag. With
this, build a compile-time mask of those SD flags.

Note that sd_parent_degenerate() uses an extra flag in its mask,
SD_PREFER_SIBLING, which remains singled out for now.

Suggested-by: Peter Zijlstra
Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Reviewed-by: Dietmar Eggemann
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-8-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 39 ++++++++++++++++++++++++++++-----------
 include/linux/sched/topology.h |  7 +++++++
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index c24a45b05fbb..ee5cbfc7189f 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -8,7 +8,7 @@
 #endif
 
 /*
- * Expected flag uses
+ * Hierarchical metaflags
  *
  * SHARED_CHILD: These flags are meant to be set from the base domain upwards.
  * If a domain has this flag set, all of its children should have it set. This
@@ -29,29 +29,42 @@
  * certain level (e.g. domain starts spanning CPUs outside of the base CPU's
  * socket).
  */
-#define SDF_SHARED_CHILD	0x1
-#define SDF_SHARED_PARENT	0x2
+#define SDF_SHARED_CHILD       0x1
+#define SDF_SHARED_PARENT      0x2
+
+/*
+ * Behavioural metaflags
+ *
+ * NEEDS_GROUPS: These flags are only relevant if the domain they are set on has
+ * more than one group. This is usually for balancing flags (load balancing
+ * involves equalizing a metric between groups), or for flags describing some
+ * shared resource (which would be shared between groups).
+ */
+#define SDF_NEEDS_GROUPS       0x4
 
 /*
  * Balance when about to become idle
  *
  * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
+ * NEEDS_GROUPS: Load balancing flag.
  */
-SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD)
+SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
 /*
  * Balance on exec
 *
 * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ * NEEDS_GROUPS: Load balancing flag.
 */
-SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD)
+SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
 /*
  * Balance on fork, clone
 *
 * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ * NEEDS_GROUPS: Load balancing flag.
 */
-SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD)
+SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
 /*
  * Balance on wakeup
@@ -69,24 +82,28 @@ SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
 
 /*
  * Domain members have different CPU capacities
+ *
+ * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
  */
-SD_FLAG(SD_ASYM_CPUCAPACITY, 0)
+SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_NEEDS_GROUPS)
 
 /*
  * Domain members share CPU capacity (i.e. SMT)
  *
  * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
- * CPU capacity.
+ *               CPU capacity.
+ * NEEDS_GROUPS: Capacity is shared between groups.
  */
-SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD)
+SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
 /*
  * Domain members share CPU package resources (i.e. caches)
  *
  * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
- * the same cache(s).
+ *               the same cache(s).
+ * NEEDS_GROUPS: Caches are shared between groups.
  */
-SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD)
+SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
 /*
  * Only a single load balancing instance
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 32f602ff37a0..2d59ca77103e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -25,6 +25,13 @@ enum {
 };
 #undef SD_FLAG
 
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 #ifdef CONFIG_SCHED_DEBUG
 #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
 static const struct {

From c200191d4c2c1aa2ffb62a984b756ac1f02dc55c Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:29:56 +0100
Subject: sched/topology: Propagate SD_ASYM_CPUCAPACITY upwards

We currently set this flag *only* on domains whose topology level
exactly matches the level where we detect asymmetry (as returned by
asym_cpu_capacity_level()). This is rather problematic.

Say there are two clusters in the system, one with a lone big CPU and
the other with a mix of big and LITTLE CPUs (as is allowed by DynamIQ):

  DIE [                ]
  MC  [             ][ ]
       0   1   2   3  4
       L   L   B   B  B

asym_cpu_capacity_level() will figure out that the MC level is the one
where all CPUs can see a CPU of max capacity, and we will thus set
SD_ASYM_CPUCAPACITY at MC level for all CPUs.

That lone big CPU will degenerate its MC domain, since it would be
alone in there, and will end up with just a DIE domain. Since the flag
was only set at MC, this CPU ends up not seeing any SD with the flag
set, which is broken.

Rather than clearing dflags at every topology level, clear it before
entering the topology level loop. This will properly propagate upwards
flags that are set starting from a certain level.
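As an aside, the SD_DEGENERATE_GROUPS_MASK construction a few hunks up can look opaque on first read. Here is the same trick in miniature, with invented flag names (illustrative only, not kernel code): each expansion contributes either the flag's value or 0, followed by a '|', and the trailing 0 terminates the chain:

#define SDF_NEEDS_GROUPS 0x4

/* One expansion per flag: contributes the flag's value when the
 * metaflag is present (!!(...) == 1), or 0 otherwise. */
#define DEMO_FLAG(name, mflags) ((name) * !!((mflags) & SDF_NEEDS_GROUPS)) |

enum { FLAG_A = 0x1, FLAG_B = 0x2 };

static const unsigned int DEMO_MASK =
	DEMO_FLAG(FLAG_A, SDF_NEEDS_GROUPS)	/* contributes 0x1 */
	DEMO_FLAG(FLAG_B, 0)			/* contributes 0x0 */
	0;					/* terminates the trailing '|' */

/* DEMO_MASK == 0x1 */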
Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Reviewed-by: Quentin Perret
Reviewed-by: Dietmar Eggemann
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-11-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 4 +++-
 kernel/sched/topology.c        | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index ee5cbfc7189f..40ad0d5c32e3 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -83,9 +83,11 @@ SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
 /*
  * Domain members have different CPU capacities
  *
+ * SHARED_PARENT: Set from the topmost domain down to the first domain where
+ *                asymmetry is detected.
  * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
  */
-SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_NEEDS_GROUPS)
+SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
 
 /*
  * Domain members share CPU capacity (i.e. SMT)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c662c53736e2..f36ed96f3197 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1988,11 +1988,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
+		int dflags = 0;
 
 		sd = NULL;
 		for_each_sd_topology(tl) {
-			int dflags = 0;
-
 			if (tl == tl_asym) {
 				dflags |= SD_ASYM_CPUCAPACITY;
 				has_asym = true;

From 3a6712c7685352a5d71eaf459a4fddfc3589f018 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:29:57 +0100
Subject: sched/topology: Mark SD_PREFER_SIBLING as SDF_NEEDS_GROUPS

SD_PREFER_SIBLING is currently considered in sd_parent_degenerate()
but not in sd_degenerate(). It too hinges on load balancing, and thus
won't have any effect when set on a domain with a single group. Add it
to SD_DEGENERATE_GROUPS_MASK.

Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-12-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 4 +++-
 kernel/sched/topology.c        | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 40ad0d5c32e3..d28fe67c3098 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -131,8 +131,10 @@ SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD)
  *
  * Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
  * flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
+ *
+ * NEEDS_GROUPS: Load balancing flag.
  */
-SD_FLAG(SD_PREFER_SIBLING, 0)
+SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f36ed96f3197..c674aaab312c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -184,7 +184,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 
 	/* Flags needing groups don't count if only 1 group in parent */
 	if (parent->groups == parent->groups->next)
-		pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING);
+		pflags &= ~SD_DEGENERATE_GROUPS_MASK;
 
 	if (~cflags & pflags)
 		return 0;

From 94b858fea1f2246a2fb7f7af21840fd14ced028f Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:29:58 +0100
Subject: sched/topology: Mark SD_BALANCE_WAKE as SDF_NEEDS_GROUPS

Even if no mainline topology uses this flag, it is a load balancing
flag just like SD_BALANCE_FORK and requires 2+ groups to have any
effect.

Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-13-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index d28fe67c3098..729510a291b2 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -70,8 +70,9 @@ SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
  * Balance on wakeup
  *
  * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
+ * NEEDS_GROUPS: Load balancing flag.
  */
-SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD)
+SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
 /*
  * Consider waking task on waking CPU.

From bdb7c802cc0a7e21f5223dc3ce41b7ac220c576e Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:29:59 +0100
Subject: sched/topology: Mark SD_SERIALIZE as SDF_NEEDS_GROUPS

There would be no point in preserving a sched_domain with a single
group just because it has this flag set. Add it to
SD_DEGENERATE_GROUPS_MASK.

Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-14-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 729510a291b2..b7f4d80e338e 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -112,11 +112,12 @@ SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
  * Only a single load balancing instance
  *
  * SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
- * different level upwards, but it doesn't change that if a domain has this flag
- * set, then all of its parents need to have it too (otherwise the serialization
- * doesn't make sense).
+ *                different level upwards, but it doesn't change that if a
+ *                domain has this flag set, then all of its parents need to have
+ *                it too (otherwise the serialization doesn't make sense).
+ * NEEDS_GROUPS: No point in preserving domain if it has a single group.
  */
-SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT)
+SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

From 33199b0143daf4778d6301f966cb914d75f122eb Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:30:00 +0100
Subject: sched/topology: Mark SD_ASYM_PACKING as SDF_NEEDS_GROUPS

Being a load-balancing flag, it requires 2+ groups to have any effect.

Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-15-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index b7f4d80e338e..2998ece2c18d 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -123,10 +123,11 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  * Place busy tasks earlier in the domain
  *
  * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
- * up, but currently assumed to be set from the base domain upwards (see
- * update_top_cache_domain()).
+ *               up, but currently assumed to be set from the base domain
+ *               upwards (see update_top_cache_domain()).
+ * NEEDS_GROUPS: Load balancing flag.
  */
-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD)
+SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

From 3551e954f5d95faf3dbc340d422da7624658c230 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:30:01 +0100
Subject: sched/topology: Mark SD_OVERLAP as SDF_NEEDS_GROUPS

A sched_domain can only have overlapping sched_groups if it has more
than one group.

Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-16-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 2998ece2c18d..29af5f032861 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -143,8 +143,9 @@ SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
  * sched_groups of this level overlap
  *
  * SHARED_PARENT: Set for all NUMA levels above NODE.
+ * NEEDS_GROUPS: Overlaps can only exist with more than one group.
  */
-SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT)
+SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

From 5f4a1c4ea44728aa80be21dbf3a0469b5ca81d88 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 17 Aug 2020 12:30:02 +0100
Subject: sched/topology: Mark SD_NUMA as SDF_NEEDS_GROUPS

There would be no point in preserving a sched_domain with a single
group just because it has this flag set. Add it to
SD_DEGENERATE_GROUPS_MASK.
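With every flag now annotated, the shape the degeneration helpers converge on can be summarized in a short sketch. This is a simplified illustration of the intent, not the verbatim mainline helpers (field names follow struct sched_domain as used in the hunks above):

/* A domain is pointless if it spans a single CPU, or if all of its
 * remaining flags need 2+ groups while it only has one. */
static int sd_degenerate_sketch(struct sched_domain *sd)
{
	if (cpumask_weight(sched_domain_span(sd)) == 1)
		return 1;	/* single CPU: nothing to balance */

	/* A flag needing groups keeps the domain alive only if the
	 * domain actually has more than one group. */
	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
	    sd->groups != sd->groups->next)
		return 0;

	/* Any leftover flag (e.g. SD_WAKE_AFFINE) also keeps it alive. */
	if (sd->flags & ~SD_DEGENERATE_GROUPS_MASK)
		return 0;

	return 1;
}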
Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200817113003.20802-17-valentin.schneider@arm.com
---
 include/linux/sched/sd_flags.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 29af5f032861..34b21e971d77 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -151,5 +151,6 @@ SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  * Cross-node balancing
  *
  * SHARED_PARENT: Set for all NUMA levels above NODE.
+ * NEEDS_GROUPS: No point in preserving domain if it has a single group.
  */
-SD_FLAG(SD_NUMA, SDF_SHARED_PARENT)
+SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

From 01ccf592362a984534371b3596d4c953da6a7bb2 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 19 Aug 2020 21:55:05 +0200
Subject: sched: Bring the PF_IO_WORKER and PF_WQ_WORKER bits closer together

The bits PF_IO_WORKER and PF_WQ_WORKER are tested together in
sched_submit_work(), which is considered to be a hot path.

If the two bits cross the 8 or 16 bit boundary then most architectures
require multiple load instructions in order to create the constant
value. Also, such a value cannot be encoded within the compare opcode.

By moving the bit definitions within the same block, the compiler can
create/use one immediate value.

For some reason gcc-10 on ARM64 requires both bits to be next to each
other in order to issue "tst reg, val; bne label". Otherwise the
result is "mov reg1, val; tst reg, reg1; bne label".

Move PF_VCPU out of the way so that PF_IO_WORKER can be next to
PF_WQ_WORKER.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200819195505.y3fxk72sotnrkczi@linutronix.de
---
 include/linux/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 93ecd930efd3..2bf0af19a62a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1489,9 +1489,10 @@ extern struct pid *cad_pid;
 /*
  * Per process flags
  */
+#define PF_VCPU			0x00000001	/* I'm a virtual CPU */
 #define PF_IDLE			0x00000002	/* I am an IDLE thread */
 #define PF_EXITING		0x00000004	/* Getting shut down */
-#define PF_VCPU			0x00000010	/* I'm a virtual CPU */
+#define PF_IO_WORKER		0x00000010	/* Task is an IO worker */
 #define PF_WQ_WORKER		0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC		0x00000040	/* Forked but didn't exec */
 #define PF_MCE_PROCESS		0x00000080	/* Process policy on mce errors */
@@ -1515,7 +1516,6 @@ extern struct pid *cad_pid;
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
-#define PF_IO_WORKER		0x20000000	/* Task is an IO worker */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK		0x80000000	/* This thread called freeze_processes() and should not be frozen */

From 8fca9494d4b4d6b57b1398cd473feb308df656db Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Tue, 25 Aug 2020 14:32:15 +0100
Subject: sched/topology: Move sd_flag_debug out of linux/sched/topology.h

Defining an array in a header imported all over the place clearly is
a daft idea, but that
still didn't stop me from doing it.

Leave a declaration of sd_flag_debug in topology.h and move its
definition to sched/debug.c.

Fixes: b6e862f38672 ("sched/topology: Define and assign sched_domain flag metadata")
Reported-by: Andy Shevchenko
Signed-off-by: Valentin Schneider
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200825133216.9163-1-valentin.schneider@arm.com
---
 include/linux/sched/topology.h | 9 ++++-----
 kernel/sched/debug.c           | 6 ++++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2d59ca77103e..b9b0dab4d067 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -33,14 +33,13 @@ static const unsigned int SD_DEGENERATE_GROUPS_MASK =
 #undef SD_FLAG
 
 #ifdef CONFIG_SCHED_DEBUG
-#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
-static const struct {
+
+struct sd_flag_debug {
 	unsigned int meta_flags;
 	char *name;
-} sd_flag_debug[] = {
-#include <linux/sched/sd_flags.h>
 };
-#undef SD_FLAG
+extern const struct sd_flag_debug sd_flag_debug[];
+
 #endif
 
 #ifdef CONFIG_SCHED_SMT
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0655524700d2..0d7896d2a0b2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,6 +245,12 @@ set_table_entry(struct ctl_table *entry,
 	entry->proc_handler = proc_handler;
 }
 
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
 static int sd_ctl_doflags(struct ctl_table *table, int write,
 			  void *buffer, size_t *lenp, loff_t *ppos)
 {

From 4fc472f1214ef75e5450f207e23ff13af6eecad4 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Tue, 25 Aug 2020 14:32:16 +0100
Subject: sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h

SD_DEGENERATE_GROUPS_MASK is only useful for sched/topology.c, but
still gets defined for anyone who imports topology.h, leading to a
flurry of unused variable warnings.

Move it out of the header and place it next to the SD degeneration
functions in sched/topology.c.
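With the metadata table now living in debug.c, decoding a domain's flags word becomes a table walk. A minimal sketch of such a decoder (assuming CONFIG_SCHED_DEBUG; illustrative only, not the mainline sd_ctl_doflags() implementation — the helper name is invented):

#include <linux/bitops.h>
#include <linux/printk.h>
#include <linux/sched/topology.h>

/* A flag's metadata index is its bit position, i.e. __ffs(flag),
 * so a set-bit walk visits exactly the entries that apply. */
static void sd_print_flags(unsigned long flags)
{
	unsigned int idx;

	for_each_set_bit(idx, &flags, __SD_FLAG_CNT)
		pr_cont("%s ", sd_flag_debug[idx].name);
	pr_cont("\n");
}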
Fixes: 4ee4ea443a5d ("sched/topology: Introduce SD metaflag for flags needing > 1 groups")
Reported-by: Andy Shevchenko
Signed-off-by: Valentin Schneider
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200825133216.9163-2-valentin.schneider@arm.com
---
 include/linux/sched/topology.h | 7 -------
 kernel/sched/topology.c        | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index b9b0dab4d067..9ef7bf686a9f 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -25,13 +25,6 @@ enum {
 };
 #undef SD_FLAG
 
-/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
-#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
-static const unsigned int SD_DEGENERATE_GROUPS_MASK =
-#include <linux/sched/sd_flags.h>
-0;
-#undef SD_FLAG
-
 #ifdef CONFIG_SCHED_DEBUG
 
 struct sd_flag_debug {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c674aaab312c..aa1676abc544 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -154,6 +154,13 @@ static inline bool sched_debug(void)
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpumask_weight(sched_domain_span(sd)) == 1)

From 2a36ab717e8fe678d98f81c14a0b124712719840 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov
Date: Wed, 23 Sep 2020 16:36:16 -0700
Subject: rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ

This patchset is based on Google-internal RSEQ work done by Paul
Turner and Andrew Hunter.

When working with per-CPU RSEQ-based memory allocations, it is
sometimes important to make sure that a global memory location is no
longer accessed from RSEQ critical sections. For example, there can be
two per-CPU lists, one is "active" and accessed per-CPU, while another
one is inactive and worked on asynchronously "off CPU" (e.g. garbage
collection is performed). Then at some point the two lists are
swapped, and a fast RCU-like mechanism is required to make sure that
the previously active list is no longer accessed.

This patch introduces such a mechanism: in short, the membarrier()
syscall issues an IPI to a CPU, restarting a potentially active RSEQ
critical section on the CPU.
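A minimal userspace sketch of the resulting API (assuming kernel headers new enough to carry the RSEQ commands; glibc provides no membarrier() wrapper, so the raw syscall is used, and error handling is elided):

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static long membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* Register the process's intent once... */
	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0);

	/* ...then restart any rseq critical section currently running
	 * on CPU 3 (or on all sibling CPUs, if flags is 0). */
	membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
		   MEMBARRIER_CMD_FLAG_CPU, 3);
	return 0;
}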
Signed-off-by: Peter Oskolkov
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Mathieu Desnoyers
Link: https://lkml.kernel.org/r/20200923233618.2572849-1-posk@google.com
---
 include/linux/sched/mm.h        |   3 +
 include/linux/syscalls.h        |   2 +-
 include/uapi/linux/membarrier.h |  26 ++++++++
 kernel/sched/membarrier.c       | 136 +++++++++++++++++++++++++++++++---------
 4 files changed, 136 insertions(+), 31 deletions(-)

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index f889e332912f..15bfb06f2884 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -348,10 +348,13 @@ enum {
 	MEMBARRIER_STATE_GLOBAL_EXPEDITED			= (1U << 3),
 	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY	= (1U << 4),
 	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE		= (1U << 5),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY		= (1U << 6),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ			= (1U << 7),
 };
 
 enum {
 	MEMBARRIER_FLAG_SYNC_CORE	= (1U << 0),
+	MEMBARRIER_FLAG_RSEQ		= (1U << 1),
 };
 
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 75ac7f8ae93c..06db09875aa4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
 			const char __user *const __user *argv,
 			const char __user *const __user *envp, int flags);
 asmlinkage long sys_userfaultfd(int flags);
-asmlinkage long sys_membarrier(int cmd, int flags);
+asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
 asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
 				    int fd_out, loff_t __user *off_out,
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 5891d7614c8c..737605897f36 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -114,6 +114,26 @@
  *                          If this command is not implemented by an
  *                          architecture, -EINVAL is returned.
  *                          Returns 0 on success.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ *                          Ensure the caller thread, upon return from
+ *                          system call, that all its running thread
+ *                          siblings have any currently running rseq
+ *                          critical sections restarted if @flags
+ *                          parameter is 0; if @flags parameter is
+ *                          MEMBARRIER_CMD_FLAG_CPU,
+ *                          then this operation is performed only
+ *                          on CPU indicated by @cpu_id. If this command is
+ *                          not implemented by an architecture, -EINVAL
+ *                          is returned. A process needs to register its
+ *                          intent to use the private expedited rseq
+ *                          command prior to using it, otherwise
+ *                          this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ.
+ *                          If this command is not implemented by an
+ *                          architecture, -EINVAL is returned.
+ *                          Returns 0 on success.
  * @MEMBARRIER_CMD_SHARED:
  *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
  *                          header backward compatibility.
@@ -131,9 +151,15 @@ enum membarrier_cmd {
 	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED		= (1 << 4),
 	MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		= (1 << 5),
 	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE	= (1 << 6),
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			= (1 << 7),
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ		= (1 << 8),
 
 	/* Alias for header backward compatibility. */
 	MEMBARRIER_CMD_SHARED	= MEMBARRIER_CMD_GLOBAL,
 };
 
+enum membarrier_cmd_flag {
+	MEMBARRIER_CMD_FLAG_CPU	= (1 << 0),
+};
+
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a7d61b..e23e74d52db5 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,14 @@
 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
 #endif
 
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
+	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+#else
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK						\
 	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
 	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
@@ -30,6 +38,11 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_rseq(void *info)
+{
+	rseq_preempt(current);
+}
+
 static void ipi_sync_rq_state(void *info)
 {
 	struct mm_struct *mm = (struct mm_struct *) info;
@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
 	return 0;
 }
 
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
 {
-	int cpu;
 	cpumask_var_t tmpmask;
 	struct mm_struct *mm = current->mm;
+	smp_call_func_t ipi_func = ipi_mb;
 
-	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
 		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
+	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
+		if (!IS_ENABLED(CONFIG_RSEQ))
+			return -EINVAL;
+		if (!(atomic_read(&mm->membarrier_state) &
+		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+			return -EPERM;
+		ipi_func = ipi_rseq;
 	} else {
+		WARN_ON_ONCE(flags);
 		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 			return -EPERM;
@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
 	 */
 	smp_mb();	/* system call entry is not a mb. */
 
-	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 		return -ENOMEM;
 
 	cpus_read_lock();
-	rcu_read_lock();
-	for_each_online_cpu(cpu) {
+
+	if (cpu_id >= 0) {
 		struct task_struct *p;
 
-		/*
-		 * Skipping the current CPU is OK even through we can be
-		 * migrated at any point. The current CPU, at the point
-		 * where we read raw_smp_processor_id(), is ensured to
-		 * be in program order with respect to the caller
-		 * thread. Therefore, we can skip this CPU from the
-		 * iteration.
-		 */
-		if (cpu == raw_smp_processor_id())
-			continue;
-		p = rcu_dereference(cpu_rq(cpu)->curr);
-		if (p && p->mm == mm)
-			__cpumask_set_cpu(cpu, tmpmask);
+		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+			goto out;
+		if (cpu_id == raw_smp_processor_id())
+			goto out;
+		rcu_read_lock();
+		p = rcu_dereference(cpu_rq(cpu_id)->curr);
+		if (!p || p->mm != mm) {
+			rcu_read_unlock();
+			goto out;
+		}
+		rcu_read_unlock();
+	} else {
+		int cpu;
+
+		rcu_read_lock();
+		for_each_online_cpu(cpu) {
+			struct task_struct *p;
+
+			/*
+			 * Skipping the current CPU is OK even through we can be
+			 * migrated at any point. The current CPU, at the point
+			 * where we read raw_smp_processor_id(), is ensured to
+			 * be in program order with respect to the caller
+			 * thread. Therefore, we can skip this CPU from the
+			 * iteration.
+			 */
+			if (cpu == raw_smp_processor_id())
+				continue;
+			p = rcu_dereference(cpu_rq(cpu)->curr);
+			if (p && p->mm == mm)
+				__cpumask_set_cpu(cpu, tmpmask);
+		}
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
 
 	preempt_disable();
-	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+	if (cpu_id >= 0)
+		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+	else
+		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
 	preempt_enable();
 
-	free_cpumask_var(tmpmask);
+out:
+	if (cpu_id < 0)
+		free_cpumask_var(tmpmask);
 	cpus_read_unlock();
 
 	/*
@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
 	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
 	    ret;
 
-	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
 		ready_state =
 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
+		if (!IS_ENABLED(CONFIG_RSEQ))
+			return -EINVAL;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+	} else {
+		WARN_ON_ONCE(flags);
 	}
 
 	/*
@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
 		return 0;
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	if (flags & MEMBARRIER_FLAG_RSEQ)
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
 	atomic_or(set_state, &mm->membarrier_state);
 	ret = sync_runqueues_membarrier_state(mm);
 	if (ret)
@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
- * @cmd:   Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd:    Takes command values defined in enum membarrier_cmd.
+ * @flags:  Currently needs to be 0 for all commands other than
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ *          contains the CPU on which to interrupt (= restart)
+ *          the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ *          RSEQ CS should be interrupted (@cmd must be
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, not available on the running
@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
 *	smp_mb()		X	O	O
 *	sys_membarrier()	O	O	O
 */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
 {
-	if (unlikely(flags))
-		return -EINVAL;
+	switch (cmd) {
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+			return -EINVAL;
+		break;
+	default:
+		if (unlikely(flags))
+			return -EINVAL;
+	}
+
+	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+		cpu_id = -1;
+
 	switch (cmd) {
 	case MEMBARRIER_CMD_QUERY:
 	{
@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 		return membarrier_register_global_expedited();
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		return membarrier_private_expedited(0);
+		return membarrier_private_expedited(0, cpu_id);
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 		return membarrier_register_private_expedited(0);
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
-		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
 	default:
 		return -EINVAL;
 	}

From 51cf18c90ca1b51d1cb4af3064e85fcf8610b5d2 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort
Date: Fri, 28 Aug 2020 10:00:49 +0100
Subject: sched/debug: Add new tracepoint to track cpu_capacity

rq->cpu_capacity is a key element in several scheduler parts, such as
EAS task placement and load balancing. Tracking this value enables
testing and/or debugging by a toolkit.
Signed-off-by: Vincent Donnefort
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/1598605249-72651-1-git-send-email-vincent.donnefort@arm.com
---
 include/linux/sched.h        |  1 +
 include/trace/events/sched.h |  4 ++++
 kernel/sched/core.c          |  1 +
 kernel/sched/fair.c          | 14 ++++++++++++++
 4 files changed, 20 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2bf0af19a62a..f516c18eeb20 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2044,6 +2044,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
 
 int sched_trace_rq_cpu(struct rq *rq);
+int sched_trace_rq_cpu_capacity(struct rq *rq);
 int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fec25b9cfbaf..c96a4337afe6 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -630,6 +630,10 @@ DECLARE_TRACE(pelt_se_tp,
 	TP_PROTO(struct sched_entity *se),
 	TP_ARGS(se));
 
+DECLARE_TRACE(sched_cpu_capacity_tp,
+	TP_PROTO(struct rq *rq),
+	TP_ARGS(rq));
+
 DECLARE_TRACE(sched_overutilized_tp,
 	TP_PROTO(struct root_domain *rd, bool overutilized),
 	TP_ARGS(rd, overutilized));
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dd32d859ab40..3dc415f58bd7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cec6cf9b2bb3..aa4c6227cd6d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8108,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 		capacity = 1;
 
 	cpu_rq(cpu)->cpu_capacity = capacity;
+	trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = capacity;
 	sdg->sgc->max_capacity = capacity;
@@ -11321,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq)
 }
 EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
 
+int sched_trace_rq_cpu_capacity(struct rq *rq)
+{
+	return rq ?
+#ifdef CONFIG_SMP
+		rq->cpu_capacity
+#else
+		SCHED_CAPACITY_SCALE
+#endif
+		: -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
+
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
 {
 #ifdef CONFIG_SMP
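Like the existing pelt_*_tp tracepoints above it, sched_cpu_capacity_tp is a "bare" tracepoint: it has no predefined trace event, so a toolkit attaches to it from a module. A sketch of such a consumer follows (assuming the register_trace_<name>() helpers that DECLARE_TRACE() generates; the module itself is hypothetical):

#include <linux/module.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

/* Probe: log the CPU and its freshly updated capacity, using the
 * accessors exported by this patch (struct rq stays opaque here). */
static void capacity_probe(void *data, struct rq *rq)
{
	trace_printk("cpu=%d capacity=%d\n",
		     sched_trace_rq_cpu(rq),
		     sched_trace_rq_cpu_capacity(rq));
}

static int __init capacity_tp_init(void)
{
	return register_trace_sched_cpu_capacity_tp(capacity_probe, NULL);
}

static void __exit capacity_tp_exit(void)
{
	unregister_trace_sched_cpu_capacity_tp(capacity_probe, NULL);
	tracepoint_synchronize_unregister();
}

module_init(capacity_tp_init);
module_exit(capacity_tp_exit);
MODULE_LICENSE("GPL");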