-rw-r--r--  MAINTAINERS              |  1
-rw-r--r--  init/Kconfig             |  1
-rw-r--r--  kernel/sched/core.c      | 39
-rw-r--r--  kernel/sched/fair.c      | 62
-rw-r--r--  kernel/sched/features.h  |  2
-rw-r--r--  kernel/sched/sched.h     |  4
-rw-r--r--  kernel/sched/topology.c  |  3
7 files changed, 98 insertions(+), 14 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index aee340630eca..f017dc6ce7ab 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17047,6 +17047,7 @@ F: drivers/net/ppp/pptp.c
PRESSURE STALL INFORMATION (PSI)
M: Johannes Weiner <[email protected]>
M: Suren Baghdasaryan <[email protected]>
+R: Peter Zijlstra <[email protected]>
S: Maintained
F: include/linux/psi*
F: kernel/sched/psi.c
diff --git a/init/Kconfig b/init/Kconfig
index f7f65af4ee12..5e7d4885d1bf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -629,6 +629,7 @@ config TASK_IO_ACCOUNTING
config PSI
bool "Pressure stall information tracking"
+ select KERNFS
help
Collect metrics that indicate how overcommitted the CPU, memory,
and IO capacity are in the system.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d3422f7d0d..a97eab3e775a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1194,6 +1194,20 @@ static void nohz_csd_func(void *info)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
+{
+ if (rq->nr_running != 1)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (!task_on_rq_queued(p))
+ return false;
+
+ return true;
+}
+
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
@@ -1229,6 +1243,18 @@ bool sched_can_stop_tick(struct rq *rq)
if (rq->nr_running > 1)
return false;
+ /*
+ * If there is one task and it has CFS runtime bandwidth constraints
+ * and it's on the cpu now, we don't want to stop the tick.
+ * This check prevents clearing the bit if a newly enqueued task here is
+ * dequeued by migrating while the constrained task continues to run.
+ * E.g. going from 2->1 without going through pick_next_task().
+ */
+ if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+ if (cfs_task_bw_constrained(rq->curr))
+ return false;
+ }
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -9955,7 +9981,7 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -11089,11 +11115,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
/*
* Ensure max(child_quota) <= parent_quota. On cgroup2,
- * always take the min. On cgroup1, only inherit when no
- * limit is set:
+ * always take the non-RUNTIME_INF min. On cgroup1, only
+ * inherit when no limit is set. In both cases this is used
+ * by the scheduler to determine if a given CFS task has a
+ * bandwidth constraint at some higher level.
*/
if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
- quota = min(quota, parent_quota);
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF)
+ quota = min(quota, parent_quota);
} else {
if (quota == RUNTIME_INF)
quota = parent_quota;
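
The tg_cfs_schedulable_down() hunk above makes hierarchical_quota track the tightest quota found anywhere on the path to the root, so that cfs_task_bw_constrained() can answer with a single lookup instead of walking the hierarchy. A minimal, self-contained sketch of that propagation rule follows; effective_quota() and RUNTIME_INF_MODEL are illustrative names for this sketch only, not kernel symbols.

#include <stdio.h>

#define RUNTIME_INF_MODEL (~0ULL)

static unsigned long long effective_quota(unsigned long long quota,
                                          unsigned long long parent_quota,
                                          int on_cgroup2)
{
        if (on_cgroup2) {
                /* cgroup2: take the tightest (non-RUNTIME_INF) of the two. */
                if (quota == RUNTIME_INF_MODEL)
                        return parent_quota;
                if (parent_quota != RUNTIME_INF_MODEL && parent_quota < quota)
                        return parent_quota;
                return quota;
        }

        /* cgroup1: only inherit the parent's limit when the child has none. */
        if (quota == RUNTIME_INF_MODEL)
                return parent_quota;
        return quota;
}

int main(void)
{
        /* Parent capped at 50ms per period, child unlimited: child sees 50ms. */
        printf("%llu\n", effective_quota(RUNTIME_INF_MODEL, 50000000ULL, 1));
        /* Parent 50ms, child 20ms, cgroup2: the non-INF minimum is 20ms. */
        printf("%llu\n", effective_quota(20000000ULL, 50000000ULL, 1));
        /* Both unlimited: the result stays RUNTIME_INF_MODEL (no constraint). */
        printf("%llu\n", effective_quota(RUNTIME_INF_MODEL, RUNTIME_INF_MODEL, 1));
        return 0;
}
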
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c8c9f7d8496..f496cef90ce7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6110,13 +6110,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
cfs_b->burst = 0;
+ cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
@@ -6253,6 +6254,46 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
rq_clock_stop_loop_update(rq);
}
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (cfs_rq->runtime_enabled ||
+ tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq);
+
+ if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+ return;
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (rq->nr_running != 1)
+ return;
+
+ /*
+ * We know there is only one task runnable and we've just picked it. The
+ * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
+ * otherwise be able to stop the tick. We just need to check if we are using
+ * bandwidth control.
+ */
+ if (cfs_task_bw_constrained(p))
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#endif
+
#else /* CONFIG_CFS_BANDWIDTH */
static inline bool cfs_bandwidth_used(void)
@@ -6282,7 +6323,7 @@ static inline int throttled_lb_pair(struct task_group *tg,
return 0;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -6295,9 +6336,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ return false;
+}
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
+
/**************************************************
* CFS operations on tasks:
*/
@@ -7372,9 +7422,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
- if (boost)
- util_est = max(util_est, runnable);
-
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
* to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@@ -8160,6 +8207,7 @@ done: __maybe_unused;
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
return p;
@@ -12499,7 +12547,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
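
Taken together, the core.c and fair.c hunks keep the tick running on a nohz_full CPU whenever the only runnable task is a CFS task with a quota somewhere in its cgroup hierarchy, since without the tick that task's runtime would not be accounted and it could run well past its quota before being throttled. The sketch below only mirrors the shape of those checks; struct cpu_state and can_stop_tick() are illustrative names, not kernel symbols.

#include <stdbool.h>
#include <stdio.h>

struct cpu_state {
        int  nr_running;            /* runnable tasks on this runqueue         */
        bool curr_is_cfs;           /* current task is in the fair class       */
        bool curr_queued;           /* current task is still queued on the rq  */
        bool curr_bw_constrained;   /* cfs_task_bw_constrained() would be true */
};

/* Mirrors sched_can_stop_tick() with the new HZ_BW check folded in. */
static bool can_stop_tick(const struct cpu_state *s, bool hz_bw)
{
        if (s->nr_running > 1)
                return false;

        /*
         * One runnable task: normally the tick could stop, but if that task
         * is a CFS task with a quota somewhere up its hierarchy, keep the
         * tick so its runtime keeps being accounted and it can still be
         * throttled once the quota is used up.
         */
        if (hz_bw && s->nr_running == 1 && s->curr_is_cfs &&
            s->curr_queued && s->curr_bw_constrained)
                return false;

        return true;
}

int main(void)
{
        struct cpu_state one_limited_task = {
                .nr_running = 1, .curr_is_cfs = true,
                .curr_queued = true, .curr_bw_constrained = true,
        };

        printf("HZ_BW on:  can stop tick = %d\n", can_stop_tick(&one_limited_task, true));
        printf("HZ_BW off: can stop tick = %d\n", can_stop_tick(&one_limited_task, false));
        return 0;
}

pick_next_task_fair() covers the opposite direction: if a bandwidth-constrained task is picked after the tick was already allowed to stop, sched_fair_update_stop_tick() sets TICK_DEP_BIT_SCHED again so the tick is restarted.
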
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 54334ca5c5c6..61bcbf5e46a4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -86,3 +86,5 @@ SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
+
+SCHED_FEAT(HZ_BW, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db5853761b1f..9c5035ca3b06 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -454,11 +454,12 @@ extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
-extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern bool cfs_task_bw_constrained(struct task_struct *p);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
@@ -494,6 +495,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
+static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
#endif /* CONFIG_CGROUP_SCHED */
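
cfs_task_bw_constrained() follows the usual config-stub pattern: a real definition under CONFIG_CFS_BANDWIDTH, a false-returning fallback in fair.c for CONFIG_CGROUP_SCHED builds without bandwidth control, and the inline stub above for !CONFIG_CGROUP_SCHED, so callers such as sched_can_stop_tick() need no #ifdef of their own. A generic, hypothetical illustration of the pattern (CONFIG_FEATURE_X, feature_x_active() and struct foo are made up for this sketch):

#include <stdbool.h>

struct foo { int flags; };

static void do_extra_work(struct foo *f)
{
        f->flags |= 1;
}

#ifdef CONFIG_FEATURE_X
bool feature_x_active(struct foo *f);   /* real implementation lives elsewhere */
#else
/* Stub: when the feature is configured out this folds away to a constant. */
static inline bool feature_x_active(struct foo *f) { (void)f; return false; }
#endif

void common_path(struct foo *f)
{
        /* No #ifdef at the call site; the dead branch is eliminated when stubbed. */
        if (feature_x_active(f))
                do_extra_work(f);
}
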
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 7cfcfe5d27b9..05a5bc678c08 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -722,8 +722,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (parent->parent) {
parent->parent->child = tmp;
- if (tmp->flags & SD_SHARE_CPUCAPACITY)
- parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
+ parent->parent->groups->flags = tmp->flags;
}
/*