Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 192
1 file changed, 119 insertions(+), 73 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a708d225c28e..1a914388144a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -57,6 +57,7 @@
 #include <linux/profile.h>
 #include <linux/psi.h>
 #include <linux/rcuwait_api.h>
+#include <linux/rseq.h>
 #include <linux/sched/wake_q.h>
 #include <linux/scs.h>
 #include <linux/slab.h>
@@ -107,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
-EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -1131,6 +1132,28 @@ static void wake_up_idle_cpu(int cpu)
         if (cpu == smp_processor_id())
                 return;

+        /*
+         * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+         * part of the idle loop. This forces an exit from the idle loop
+         * and a round trip to schedule(). Now this could be optimized
+         * because a simple new idle loop iteration is enough to
+         * re-evaluate the next tick. Provided some re-ordering of tick
+         * nohz functions that would need to follow TIF_NR_POLLING
+         * clearing:
+         *
+         * - On most archs, a simple fetch_or on ti::flags with a
+         *   "0" value would be enough to know if an IPI needs to be sent.
+         *
+         * - x86 needs to perform a last need_resched() check between
+         *   monitor and mwait which doesn't take timers into account.
+         *   There a dedicated TIF_TIMER flag would be required to
+         *   fetch_or here and be checked along with TIF_NEED_RESCHED
+         *   before mwait().
+         *
+         * However, remote timer enqueue is not such a frequent event
+         * and testing of the above solutions didn't appear to report
+         * much benefits.
+         */
         if (set_nr_and_not_polling(rq->idle))
                 smp_send_reschedule(cpu);
         else
@@ -1769,7 +1792,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css);
 #endif

 #ifdef CONFIG_SYSCTL
-#ifdef CONFIG_UCLAMP_TASK
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 static void uclamp_update_root_tg(void)
 {
@@ -1875,7 +1897,6 @@ undo:
         return result;
 }
 #endif
-#endif

 static int uclamp_validate(struct task_struct *p,
                            const struct sched_attr *attr)
@@ -2042,7 +2063,7 @@ static void __init init_uclamp(void)
         }
 }

-#else /* CONFIG_UCLAMP_TASK */
+#else /* !CONFIG_UCLAMP_TASK */
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
 static inline int uclamp_validate(struct task_struct *p,
@@ -2124,12 +2145,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)

         enqueue_task(rq, p, flags);

-        p->on_rq = TASK_ON_RQ_QUEUED;
+        WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+        ASSERT_EXCLUSIVE_WRITER(p->on_rq);
 }

 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-        p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+        WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+        ASSERT_EXCLUSIVE_WRITER(p->on_rq);

         dequeue_task(rq, p, flags);
 }
@@ -3795,6 +3818,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
                 rq->idle_stamp = 0;
         }
 #endif
+
+        p->dl_server = NULL;
 }

 /*
@@ -3928,6 +3953,17 @@ void wake_up_if_idle(int cpu)
         }
 }

+bool cpus_equal_capacity(int this_cpu, int that_cpu)
+{
+        if (!sched_asym_cpucap_active())
+                return true;
+
+        if (this_cpu == that_cpu)
+                return true;
+
+        return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu);
+}
+
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
         if (this_cpu == that_cpu)
@@ -4509,10 +4545,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         memset(&p->stats, 0, sizeof(p->stats));
 #endif

-        RB_CLEAR_NODE(&p->dl.rb_node);
-        init_dl_task_timer(&p->dl);
-        init_dl_inactive_task_timer(&p->dl);
-        __dl_clear_params(p);
+        init_dl_entity(&p->dl);

         INIT_LIST_HEAD(&p->rt.run_list);
         p->rt.timeout = 0;
@@ -5629,13 +5662,13 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  */
-void scheduler_tick(void)
+void sched_tick(void)
 {
         int cpu = smp_processor_id();
         struct rq *rq = cpu_rq(cpu);
         struct task_struct *curr = rq->curr;
         struct rq_flags rf;
-        unsigned long thermal_pressure;
+        unsigned long hw_pressure;
         u64 resched_latency;

         if (housekeeping_cpu(cpu, HK_TYPE_TICK))
@@ -5646,8 +5679,8 @@ void scheduler_tick(void)
         rq_lock(rq, &rf);

         update_rq_clock(rq);
-        thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
-        update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+        hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+        update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
         curr->sched_class->task_tick(rq, curr, 0);
         if (sched_feat(LATENCY_WARN))
                 resched_latency = cpu_resched_latency(rq);
@@ -5667,7 +5700,7 @@ void scheduler_tick(void)

 #ifdef CONFIG_SMP
         rq->idle_balance = idle_cpu(cpu);
-        trigger_load_balance(rq);
+        sched_balance_trigger(rq);
 #endif
 }

@@ -6004,12 +6037,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                         p = pick_next_task_idle(rq);
                 }

+                /*
+                 * This is the fast path; it cannot be a DL server pick;
+                 * therefore even if @p == @prev, ->dl_server must be NULL.
+                 */
+                if (p->dl_server)
+                        p->dl_server = NULL;
+
                 return p;
         }

 restart:
         put_prev_task_balance(rq, prev, rf);

+        /*
+         * We've updated @prev and no longer need the server link, clear it.
+         * Must be done before ->pick_next_task() because that can (re)set
+         * ->dl_server.
+         */
+        if (prev->dl_server)
+                prev->dl_server = NULL;
+
         for_each_class(class) {
                 p = class->pick_next_task(rq);
                 if (p)
@@ -6537,7 +6585,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  *      paths. For example, see arch/x86/entry_64.S.
  *
  *      To drive preemption between tasks, the scheduler sets the flag in timer
- *      interrupt handler scheduler_tick().
+ *      interrupt handler sched_tick().
  *
  *   3. Wakeups don't really cause entry into schedule(). They add a
  *      task to the run-queue and that's it.
@@ -6599,7 +6647,9 @@ static void __sched notrace __schedule(unsigned int sched_mode)
          *   if (signal_pending_state())          if (p->state & @state)
          *
          * Also, the membarrier system call requires a full memory barrier
-         * after coming from user-space, before storing to rq->curr.
+         * after coming from user-space, before storing to rq->curr; this
+         * barrier matches a full barrier in the proximity of the membarrier
+         * system call exit.
          */
         rq_lock(rq, &rf);
         smp_mb__after_spinlock();
@@ -6670,12 +6720,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
                  *
                  * Here are the schemes providing that barrier on the
                  * various architectures:
-                 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
-                 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+                 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
+                 *   RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
+                 *   on PowerPC and on RISC-V.
                  * - finish_lock_switch() for weakly-ordered
                  *   architectures where spin_unlock is a full barrier,
                  * - switch_to() for arm64 (weakly-ordered, spin_unlock
                  *   is a RELEASE barrier),
+                 *
+                 * The barrier matches a full barrier in the proximity of
+                 * the membarrier system call entry.
+                 *
+                 * On RISC-V, this barrier pairing is also needed for the
+                 * SYNC_CORE command when switching between processes, cf.
+                 * the inline comments in membarrier_arch_switch_mm().
                  */
                 ++*switch_count;
@@ -6748,10 +6806,12 @@ static inline void sched_submit_work(struct task_struct *tsk)

 static void sched_update_worker(struct task_struct *tsk)
 {
-        if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+        if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+                if (tsk->flags & PF_BLOCK_TS)
+                        blk_plug_invalidate_ts(tsk);
                 if (tsk->flags & PF_WQ_WORKER)
                         wq_worker_running(tsk);
-                else
+                else if (tsk->flags & PF_IO_WORKER)
                         io_wq_worker_running(tsk);
         }
 }
@@ -7429,18 +7489,13 @@ int sched_core_idle_cpu(int cpu)
  * required to meet deadlines.
  */
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-                                 enum cpu_util_type type,
-                                 struct task_struct *p)
+                                 unsigned long *min,
+                                 unsigned long *max)
 {
-        unsigned long dl_util, util, irq, max;
+        unsigned long util, irq, scale;
         struct rq *rq = cpu_rq(cpu);

-        max = arch_scale_cpu_capacity(cpu);
-
-        if (!uclamp_is_used() &&
-            type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
-                return max;
-        }
+        scale = arch_scale_cpu_capacity(cpu);

         /*
          * Early check to see if IRQ/steal time saturates the CPU, can be
@@ -7448,45 +7503,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
          * update_irq_load_avg().
          */
         irq = cpu_util_irq(rq);
-        if (unlikely(irq >= max))
-                return max;
+        if (unlikely(irq >= scale)) {
+                if (min)
+                        *min = scale;
+                if (max)
+                        *max = scale;
+                return scale;
+        }
+
+        if (min) {
+                /*
+                 * The minimum utilization returns the highest level between:
+                 * - the computed DL bandwidth needed with the IRQ pressure which
+                 *   steals time to the deadline task.
+                 * - The minimum performance requirement for CFS and/or RT.
+                 */
+                *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+                /*
+                 * When an RT task is runnable and uclamp is not used, we must
+                 * ensure that the task will run at maximum compute capacity.
+                 */
+                if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+                        *min = max(*min, scale);
+        }

         /*
          * Because the time spend on RT/DL tasks is visible as 'lost' time to
          * CFS tasks and we use the same metric to track the effective
          * utilization (PELT windows are synchronized) we can directly add them
          * to obtain the CPU's actual utilization.
-         *
-         * CFS and RT utilization can be boosted or capped, depending on
-         * utilization clamp constraints requested by currently RUNNABLE
-         * tasks.
-         * When there are no CFS RUNNABLE tasks, clamps are released and
-         * frequency will be gracefully reduced with the utilization decay.
          */
         util = util_cfs + cpu_util_rt(rq);
-        if (type == FREQUENCY_UTIL)
-                util = uclamp_rq_util_with(rq, util, p);
-
-        dl_util = cpu_util_dl(rq);
+        util += cpu_util_dl(rq);

         /*
-         * For frequency selection we do not make cpu_util_dl() a permanent part
-         * of this sum because we want to use cpu_bw_dl() later on, but we need
-         * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
-         * that we select f_max when there is no idle time.
-         *
-         * NOTE: numerical errors or stop class might cause us to not quite hit
-         * saturation when we should -- something for later.
+         * The maximum hint is a soft bandwidth requirement, which can be lower
+         * than the actual utilization because of uclamp_max requirements.
          */
-        if (util + dl_util >= max)
-                return max;
+        if (max)
+                *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));

-        /*
-         * OTOH, for energy computation we need the estimated running time, so
-         * include util_dl and ignore dl_bw.
-         */
-        if (type == ENERGY_UTIL)
-                util += dl_util;
+        if (util >= scale)
+                return scale;

         /*
          * There is still idle time; further improve the number by using the
@@ -7497,28 +7556,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
          *   U' = irq + --------- * U
          *                 max
          */
-        util = scale_irq_capacity(util, irq, max);
+        util = scale_irq_capacity(util, irq, scale);
         util += irq;

-        /*
-         * Bandwidth required by DEADLINE must always be granted while, for
-         * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
-         * to gracefully reduce the frequency when no tasks show up for longer
-         * periods of time.
-         *
-         * Ideally we would like to set bw_dl as min/guaranteed freq and util +
-         * bw_dl as requested freq. However, cpufreq is not yet ready for such
-         * an interface. So, we only do the latter for now.
-         */
-        if (type == FREQUENCY_UTIL)
-                util += cpu_bw_dl(rq);
-
-        return min(max, util);
+        return min(scale, util);
 }

 unsigned long sched_cpu_util(int cpu)
 {
-        return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+        return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
 }
 #endif /* CONFIG_SMP */
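
Note on the activate_task()/deactivate_task() hunks: the plain p->on_rq stores become WRITE_ONCE() paired with ASSERT_EXCLUSIVE_WRITER(), so lockless readers can use READ_ONCE() while KCSAN still reports any racing writer. A minimal sketch of that annotation pattern follows; the example struct and helpers are made up for illustration and are not part of this diff.

#include <linux/compiler.h>             /* READ_ONCE() / WRITE_ONCE() */
#include <linux/kcsan-checks.h>         /* ASSERT_EXCLUSIVE_WRITER() */

/* Hypothetical example state, not from kernel/sched/core.c. */
struct example {
        int state;
};

/* Writer side: serialized by the owner's lock, but read locklessly. */
static void example_set_state(struct example *e, int new_state)
{
        WRITE_ONCE(e->state, new_state);        /* marked store for lockless readers */
        ASSERT_EXCLUSIVE_WRITER(e->state);      /* KCSAN: no concurrent writer allowed */
}

/* Reader side: may run without the lock, so the load is marked too. */
static int example_get_state(struct example *e)
{
        return READ_ONCE(e->state);
}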
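Note on cpus_equal_capacity(): the new helper (added next to cpus_share_cache() above) returns true immediately on symmetric systems and only compares arch_scale_cpu_capacity() otherwise. The caller below is hypothetical and only illustrates the intended pattern of skipping capacity-based decisions when the capacities cannot differ.

/*
 * Illustrative caller only, not part of this diff: prefer the previously
 * used CPU when capacities match, otherwise take the bigger CPU.
 */
static int example_pick_cpu(int prev_cpu, int this_cpu)
{
        if (cpus_equal_capacity(prev_cpu, this_cpu))
                return prev_cpu;        /* same compute capacity: keep locality */

        return arch_scale_cpu_capacity(this_cpu) >
               arch_scale_cpu_capacity(prev_cpu) ? this_cpu : prev_cpu;
}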
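Note on the effective_cpu_util() rework: the cpu_util_type switch is gone; the function now always returns the actual utilization and, when the optional pointers are passed, reports a floor (IRQ plus DL bandwidth, uclamp_min, or full capacity for an unclamped runnable RT task) and a ceiling (uclamp_max capped by CPU capacity). The sketch below shows how a frequency-selection caller might combine the three values; the function name and the headroom policy are assumptions for illustration, not code from this commit.

/*
 * Hypothetical caller sketch, not from this diff: turn the actual
 * utilization plus the min/max hints into a target performance level.
 */
static unsigned long example_target_util(int cpu)
{
        unsigned long min, max, util;

        util = effective_cpu_util(cpu, cpu_util_cfs(cpu), &min, &max);

        /* Assumed policy: add ~25% headroom above the running utilization. */
        util += util >> 2;

        /*
         * Respect the floor (DL bandwidth + IRQ, uclamp_min) and the
         * ceiling (uclamp_max, CPU capacity) reported by the scheduler.
         */
        return clamp(util, min, max);
}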