-rw-r--r--  MAINTAINERS              |  1
-rw-r--r--  init/Kconfig             |  1
-rw-r--r--  kernel/sched/core.c      | 39
-rw-r--r--  kernel/sched/fair.c      | 62
-rw-r--r--  kernel/sched/features.h  |  2
-rw-r--r--  kernel/sched/sched.h     |  4
-rw-r--r--  kernel/sched/topology.c  |  3
7 files changed, 98 insertions(+), 14 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index aee340630eca..f017dc6ce7ab 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17047,6 +17047,7 @@ F:      drivers/net/ppp/pptp.c
 PRESSURE STALL INFORMATION (PSI)
 M:      Johannes Weiner <[email protected]>
 M:      Suren Baghdasaryan <[email protected]>
+R:      Peter Ziljstra <[email protected]>
 S:      Maintained
 F:      include/linux/psi*
 F:      kernel/sched/psi.c
diff --git a/init/Kconfig b/init/Kconfig
index f7f65af4ee12..5e7d4885d1bf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -629,6 +629,7 @@ config TASK_IO_ACCOUNTING
 
 config PSI
         bool "Pressure stall information tracking"
+        select KERNFS
         help
           Collect metrics that indicate how overcommitted the CPU, memory,
           and IO capacity are in the system.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d3422f7d0d..a97eab3e775a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1194,6 +1194,20 @@ static void nohz_csd_func(void *info)
 #endif /* CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
+{
+        if (rq->nr_running != 1)
+                return false;
+
+        if (p->sched_class != &fair_sched_class)
+                return false;
+
+        if (!task_on_rq_queued(p))
+                return false;
+
+        return true;
+}
+
 bool sched_can_stop_tick(struct rq *rq)
 {
         int fifo_nr_running;
@@ -1229,6 +1243,18 @@ bool sched_can_stop_tick(struct rq *rq)
         if (rq->nr_running > 1)
                 return false;
 
+        /*
+         * If there is one task and it has CFS runtime bandwidth constraints
+         * and it's on the cpu now we don't want to stop the tick.
+         * This check prevents clearing the bit if a newly enqueued task here is
+         * dequeued by migrating while the constrained task continues to run.
+         * E.g. going from 2->1 without going through pick_next_task().
+         */
+        if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+                if (cfs_task_bw_constrained(rq->curr))
+                        return false;
+        }
+
         return true;
 }
 #endif /* CONFIG_NO_HZ_FULL */
@@ -9955,7 +9981,7 @@ void __init sched_init(void)
                 ptr += nr_cpu_ids * sizeof(void **);
 
                 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
-                init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+                init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
                 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -11089,11 +11115,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
 
                 /*
                  * Ensure max(child_quota) <= parent_quota.  On cgroup2,
-                 * always take the min.  On cgroup1, only inherit when no
-                 * limit is set:
+                 * always take the non-RUNTIME_INF min.  On cgroup1, only
+                 * inherit when no limit is set. In both cases this is used
+                 * by the scheduler to determine if a given CFS task has a
+                 * bandwidth constraint at some higher level.
                  */
                 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
-                        quota = min(quota, parent_quota);
+                        if (quota == RUNTIME_INF)
+                                quota = parent_quota;
+                        else if (parent_quota != RUNTIME_INF)
+                                quota = min(quota, parent_quota);
                 } else {
                         if (quota == RUNTIME_INF)
                                 quota = parent_quota;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c8c9f7d8496..f496cef90ce7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6110,13 +6110,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
 
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
 {
         raw_spin_lock_init(&cfs_b->lock);
         cfs_b->runtime = 0;
         cfs_b->quota = RUNTIME_INF;
         cfs_b->period = ns_to_ktime(default_cfs_period());
         cfs_b->burst = 0;
+        cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
 
         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
@@ -6253,6 +6254,46 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
         rq_clock_stop_loop_update(rq);
 }
 
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+        struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+        if (!cfs_bandwidth_used())
+                return false;
+
+        if (cfs_rq->runtime_enabled ||
+            tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+                return true;
+
+        return false;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+        int cpu = cpu_of(rq);
+
+        if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+                return;
+
+        if (!tick_nohz_full_cpu(cpu))
+                return;
+
+        if (rq->nr_running != 1)
+                return;
+
+        /*
+         * We know there is only one task runnable and we've just picked it. The
+         * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
+         * be otherwise able to stop the tick. Just need to check if we are using
+         * bandwidth control.
+         */
+        if (cfs_task_bw_constrained(p))
+                tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#endif
+
 #else /* CONFIG_CFS_BANDWIDTH */
 
 static inline bool cfs_bandwidth_used(void)
@@ -6282,7 +6323,7 @@ static inline int throttled_lb_pair(struct task_group *tg,
         return 0;
 }
 
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -6295,9 +6336,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
 static inline void update_runtime_enabled(struct rq *rq) {}
 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+        return false;
+}
+#endif
 #endif /* CONFIG_CFS_BANDWIDTH */
 
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
+
 /**************************************************
  * CFS operations on tasks:
  */
@@ -7372,9 +7422,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
         util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
 
-        if (boost)
-                util_est = max(util_est, runnable);
-
         /*
          * During wake-up @p isn't enqueued yet and doesn't contribute
          * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@@ -8160,6 +8207,7 @@ done: __maybe_unused;
         hrtick_start_fair(rq, p);
 
         update_misfit_status(p, rq);
+        sched_fair_update_stop_tick(rq, p);
 
         return p;
 
@@ -12499,7 +12547,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
         tg->shares = NICE_0_LOAD;
 
-        init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+        init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
 
         for_each_possible_cpu(i) {
                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 54334ca5c5c6..61bcbf5e46a4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -86,3 +86,5 @@ SCHED_FEAT(UTIL_EST, true)
 SCHED_FEAT(UTIL_EST_FASTUP, true)
 
 SCHED_FEAT(LATENCY_WARN, false)
+
+SCHED_FEAT(HZ_BW, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db5853761b1f..9c5035ca3b06 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -454,11 +454,12 @@ extern void unregister_fair_sched_group(struct task_group *tg);
 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                         struct sched_entity *se, int cpu,
                         struct sched_entity *parent);
-extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
 extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern bool cfs_task_bw_constrained(struct task_struct *p);
 
 extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
                 struct sched_rt_entity *rt_se, int cpu,
@@ -494,6 +495,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
 #else /* CONFIG_CGROUP_SCHED */
 
 struct cfs_bandwidth { };
+static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
 
 #endif /* CONFIG_CGROUP_SCHED */
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 7cfcfe5d27b9..05a5bc678c08 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -722,8 +722,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
                 if (parent->parent) {
                         parent->parent->child = tmp;
-                        if (tmp->flags & SD_SHARE_CPUCAPACITY)
-                                parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
+                        parent->parent->groups->flags = tmp->flags;
                 }
 
                 /*
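A note for readers following the bandwidth changes above: the new cgroup2 branch in tg_cfs_schedulable_down() makes a group's hierarchical_quota the non-infinite minimum of its own quota and its parent's effective quota, which is what lets cfs_task_bw_constrained() answer "is there a CFS limit anywhere above this task?" with a single comparison. The sketch below is a minimal userspace model of that rule, assuming a local RUNTIME_INF stand-in and a hypothetical effective_quota() helper; it is illustrative only, not kernel code.

	/* Userspace model of the cgroup2 hierarchical_quota propagation rule.
	 * RUNTIME_INF and effective_quota() are local stand-ins, not kernel APIs. */
	#include <limits.h>
	#include <stdio.h>

	#define RUNTIME_INF LLONG_MAX   /* stand-in for "no bandwidth limit" */

	/* Mirror of the patched cgroup2 branch: take the non-RUNTIME_INF min of
	 * a group's own quota and its parent's effective (hierarchical) quota. */
	static long long effective_quota(long long quota, long long parent_quota)
	{
		if (quota == RUNTIME_INF)
			return parent_quota;
		if (parent_quota != RUNTIME_INF && parent_quota < quota)
			return parent_quota;
		return quota;
	}

	int main(void)
	{
		/* root (unlimited) -> A (50ms) -> B (unlimited) -> C (20ms) */
		long long a = effective_quota(50000, RUNTIME_INF);
		long long b = effective_quota(RUNTIME_INF, a);
		long long c = effective_quota(20000, b);

		/* B inherits A's limit, so a task in B is still bandwidth
		 * constrained even though B itself sets no quota. */
		printf("A=%lld B=%lld C=%lld\n", a, b, c);  /* A=50000 B=50000 C=20000 */
		return 0;
	}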
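The nohz_full side can be read the same way: with the HZ_BW scheduler feature enabled, a CPU running a single queued CFS task keeps its tick if that task is bandwidth constrained, so runtime keeps being accounted and throttling can still occur. Below is a small self-contained model of that decision; struct model_rq, struct model_task and can_stop_tick() are simplified stand-ins for the runqueue state the patch inspects, not kernel interfaces.

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified stand-ins for the runqueue/task state used by the patch. */
	struct model_task {
		bool is_cfs;            /* p->sched_class == &fair_sched_class */
		bool on_rq;             /* task_on_rq_queued(p) */
		bool bw_constrained;    /* cfs_task_bw_constrained(p) */
	};

	struct model_rq {
		int nr_running;
		struct model_task *curr;
	};

	/* Mirrors the new check in sched_can_stop_tick(): with HZ_BW enabled,
	 * a lone, queued CFS task under a bandwidth limit keeps the tick on. */
	static bool can_stop_tick(struct model_rq *rq, bool hz_bw_feature)
	{
		if (rq->nr_running > 1)
			return false;

		if (hz_bw_feature &&
		    rq->nr_running == 1 &&
		    rq->curr->is_cfs &&
		    rq->curr->on_rq &&
		    rq->curr->bw_constrained)
			return false;

		return true;
	}

	int main(void)
	{
		struct model_task t = { .is_cfs = true, .on_rq = true, .bw_constrained = true };
		struct model_rq rq = { .nr_running = 1, .curr = &t };

		printf("constrained task alone: stop tick? %d\n", can_stop_tick(&rq, true));   /* 0 */
		t.bw_constrained = false;
		printf("unconstrained task alone: stop tick? %d\n", can_stop_tick(&rq, true)); /* 1 */
		return 0;
	}

On kernels built with CONFIG_SCHED_DEBUG, SCHED_FEAT() flags such as HZ_BW can usually be toggled at runtime through /sys/kernel/debug/sched/features (writing NO_HZ_BW disables the check), which is a convenient way to compare behaviour with and without this change.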