diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 63 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/ext.c | 233 | ||||
-rw-r--r-- | kernel/sched/ext.h | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 52 | ||||
-rw-r--r-- | kernel/sched/sched.h | 36 | ||||
-rw-r--r-- | kernel/sched/stats.h | 48 | ||||
-rw-r--r-- | kernel/sched/syscalls.c | 13 |
8 files changed, 272 insertions, 177 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aeb595514461..719e0ed1e976 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. + * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be @@ -2012,11 +2017,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) { - sched_info_enqueue(rq, p); - psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); - } - p->sched_class->enqueue_task(rq, p, flags); /* * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear @@ -2024,6 +2024,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p); + if (!(flags & ENQUEUE_RESTORE)) { + sched_info_enqueue(rq, p); + psi_enqueue(p, flags & ENQUEUE_MIGRATED); + } + if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } @@ -2041,7 +2046,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeue(rq, p); - psi_dequeue(p, flags & DEQUEUE_SLEEP); + psi_dequeue(p, !(flags & DEQUEUE_SLEEP)); } /* @@ -4323,9 +4328,10 @@ static bool __task_needs_rq_lock(struct task_struct *p) * @arg: Argument to function. * * Fix the task in it's current state by avoiding wakeups and or rq operations - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() - * to work out what the state is, if required. Given that @func can be invoked - * with a runqueue lock held, it had better be quite lightweight. + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. Given that @func + * can be invoked with a runqueue lock held, it had better be quite + * lightweight. * * Returns: * Whatever @func returns @@ -4705,7 +4711,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; #ifdef CONFIG_SCHED_CLASS_EXT - } else if (task_should_scx(p)) { + } else if (task_should_scx(p->policy)) { p->sched_class = &ext_sched_class; #endif } else { @@ -6544,6 +6550,7 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6629,6 +6636,7 @@ static void __sched notrace __schedule(int sched_mode) * After this, schedule() must not care about p->state any more. */ block_task(rq, prev, flags); + block = true; } switch_count = &prev->nvcsw; } @@ -6674,7 +6682,7 @@ picked: migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + psi_sched_switch(prev, next, block); trace_sched_switch(preempt, prev, next, prev_state); @@ -7017,20 +7025,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -void __setscheduler_prio(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(int policy, int prio) { if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; + return &dl_sched_class; + + if (rt_prio(prio)) + return &rt_sched_class; + #ifdef CONFIG_SCHED_CLASS_EXT - else if (task_should_scx(p)) - p->sched_class = &ext_sched_class; + if (task_should_scx(policy)) + return &ext_sched_class; #endif - else - p->sched_class = &fair_sched_class; - p->prio = prio; + return &fair_sched_class; } #ifdef CONFIG_RT_MUTEXES @@ -7076,7 +7084,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct rq_flags rf; struct rq *rq; @@ -7134,6 +7142,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + next_class = __setscheduler_class(p->policy, prio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -7171,7 +7184,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->rt.timeout = 0; } - __setscheduler_prio(p, prio); + p->sched_class = next_class; + p->prio = prio; + check_class_changing(rq, p, prev_class); if (queued) @@ -10465,7 +10480,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) return; - task_work_add(curr, work, TWA_RESUME); + + /* No page allocation under rq lock */ + task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); } void sched_mm_cid_exit_signals(struct task_struct *t) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ce93d0bf452..be1b917dc8ce 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2385,7 +2385,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) deadline_queue_push_tasks(rq); - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, &p->dl); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 410a4df8a121..b5f4b1a5ae98 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,7 +9,6 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { - SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, @@ -19,6 +18,12 @@ enum scx_consts { SCX_EXIT_DUMP_DFL_LEN = 32768, SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_OPS_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { @@ -857,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static int scx_ops_bypass_depth; +static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock); static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -1274,86 +1280,104 @@ struct scx_task_iter { struct task_struct *locked; struct rq *rq; struct rq_flags rf; + u32 cnt; }; /** - * scx_task_iter_init - Initialize a task iterator + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * - * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, - * @iter must eventually be exited with scx_task_iter_exit(). + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter + * must eventually be stopped with scx_task_iter_stop(). * - * scx_tasks_lock may be released between this and the first next() call or - * between any two next() calls. If scx_tasks_lock is released between two - * next() calls, the caller is responsible for ensuring that the task being - * iterated remains accessible either through RCU read lock or obtaining a - * reference count. + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() + * between this and the first next() call or between any two next() calls. If + * the locks are released between two next() calls, the caller is responsible + * for ensuring that the task being iterated remains accessible either through + * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist. */ -static void scx_task_iter_init(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + spin_lock_irq(&scx_tasks_lock); + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; + iter->cnt = 0; +} + +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) +{ + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } } /** - * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator - * @iter: iterator to unlock rq for + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator + * @iter: iterator to unlock * * If @iter is in the middle of a locked iteration, it may be locking the rq of - * the task currently being visited. Unlock the rq if so. This function can be - * safely called anytime during an iteration. + * the task currently being visited in addition to scx_tasks_lock. Unlock both. + * This function can be safely called anytime during an iteration. + */ +static void scx_task_iter_unlock(struct scx_task_iter *iter) +{ + __scx_task_iter_rq_unlock(iter); + spin_unlock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() + * @iter: iterator to re-lock * - * Returns %true if the rq @iter was locking is unlocked. %false if @iter was - * not locking an rq. + * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it + * doesn't re-lock the rq lock. Must be called before other iterator operations. */ -static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) +static void scx_task_iter_relock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; - return true; - } else { - return false; - } + spin_lock_irq(&scx_tasks_lock); } /** - * scx_task_iter_exit - Exit a task iterator + * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * - * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. - * If the iterator holds a task's rq lock, that rq lock is released. See - * scx_task_iter_init() for details. + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held + * which is released on return. If the iterator holds a task's rq lock, that rq + * lock is also released. See scx_task_iter_start() for details. */ -static void scx_task_iter_exit(struct scx_task_iter *iter) +static void scx_task_iter_stop(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - - scx_task_iter_rq_unlock(iter); list_del_init(&iter->cursor.tasks_node); + scx_task_iter_unlock(iter); } /** * scx_task_iter_next - Next task * @iter: iterator to walk * - * Visit the next task. See scx_task_iter_init() for details. + * Visit the next task. See scx_task_iter_start() for details. Locks are dropped + * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing + * stalls by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - lockdep_assert_held(&scx_tasks_lock); + if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + scx_task_iter_unlock(iter); + cond_resched(); + scx_task_iter_relock(iter); + } list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) @@ -1374,14 +1398,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify - * whether they would like to filter out dead tasks. See scx_task_iter_init() + * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details. */ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) { struct task_struct *p; - scx_task_iter_rq_unlock(iter); + __scx_task_iter_rq_unlock(iter); while ((p = scx_task_iter_next(iter))) { /* @@ -1949,7 +1973,6 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { - bool bypassing = scx_rq_bypassing(rq); struct task_struct **ddsp_taskp; unsigned long qseq; @@ -1967,7 +1990,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (bypassing) + if (scx_rq_bypassing(rq)) goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2022,7 +2045,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ - p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; + p->scx.slice = SCX_SLICE_DFL; dispatch_enqueue(find_global_dsq(p), p, enq_flags); } @@ -2958,8 +2981,8 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { - printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", - p->comm, p->pid); + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", + p->comm, p->pid, __func__); scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; @@ -3064,11 +3087,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return prev_cpu; - } - /* * If WAKE_SYNC, the waker's local DSQ is empty, and the system is * under utilized, wake up @p to the local DSQ of the waker. Checking @@ -3133,7 +3151,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu)) { + if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3198,7 +3216,7 @@ void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); - if (SCX_HAS_OP(update_idle)) { + if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); if (!static_branch_unlikely(&scx_builtin_idle_enabled)) return; @@ -4239,14 +4257,14 @@ static const struct kset_uevent_ops scx_uevent_ops = { * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. */ -bool task_should_scx(struct task_struct *p) +bool task_should_scx(int policy) { if (!scx_enabled() || unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; - return p->policy == SCHED_EXT; + return policy == SCHED_EXT; } /** @@ -4261,36 +4279,40 @@ bool task_should_scx(struct task_struct *p) * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * - * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. - * %SCX_OPS_ENQ_LAST is also ignored. + * - ops.select_cpu() is ignored and the default select_cpu() is used. * - * b. ops.dispatch() is ignored. + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. * - * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice - * can't be trusted. Whenever a tick triggers, the running task is rotated to - * the tail of the queue with core_sched_at touched. + * - ops.dispatch() is ignored. * - * d. pick_next_task() suppresses zero slice warning. + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. * - * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM - * operations. + * - pick_next_task() suppresses zero slice warning. * - * f. scx_prio_less() reverts to the default core_sched_at order. + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + * + * - scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { - int depth, cpu; + int cpu; + unsigned long flags; + raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags); if (bypass) { - depth = atomic_inc_return(&scx_ops_bypass_depth); - WARN_ON_ONCE(depth <= 0); - if (depth != 1) - return; + scx_ops_bypass_depth++; + WARN_ON_ONCE(scx_ops_bypass_depth <= 0); + if (scx_ops_bypass_depth != 1) + goto unlock; } else { - depth = atomic_dec_return(&scx_ops_bypass_depth); - WARN_ON_ONCE(depth < 0); - if (depth != 0) - return; + scx_ops_bypass_depth--; + WARN_ON_ONCE(scx_ops_bypass_depth < 0); + if (scx_ops_bypass_depth != 0) + goto unlock; } /* @@ -4307,7 +4329,7 @@ static void scx_ops_bypass(bool bypass) struct rq_flags rf; struct task_struct *p, *n; - rq_lock_irqsave(rq, &rf); + rq_lock(rq, &rf); if (bypass) { WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); @@ -4343,11 +4365,13 @@ static void scx_ops_bypass(bool bypass) sched_enq_and_set_task(&ctx); } - rq_unlock_irqrestore(rq, &rf); + rq_unlock(rq, &rf); - /* kick to restore ticks */ + /* resched to restore ticks and idle state */ resched_cpu(cpu); } +unlock: + raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags); } static void free_exit_info(struct scx_exit_info *ei) @@ -4467,16 +4491,19 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_ops_init_task_enabled = false; - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); - __setscheduler_prio(p, p->prio); + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); @@ -4484,8 +4511,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) check_class_changed(task_rq(p), p, old_class, p->prio); scx_ops_exit_task(p); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ @@ -5136,8 +5162,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_disable_unlock_all; - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and @@ -5147,15 +5172,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!tryget_task_struct(p)) continue; - scx_task_iter_rq_unlock(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_unlock(&sti); ret = scx_ops_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); + scx_task_iter_stop(&sti); scx_ops_error("ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); goto err_disable_unlock_all; @@ -5164,10 +5187,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); @@ -5184,23 +5206,27 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * scx_tasks_lock. */ percpu_down_write(&scx_fork_rwsem); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - __setscheduler_prio(p, p->prio); + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); scx_ops_bypass(false); @@ -5872,16 +5898,21 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { - *is_idle = false; - return prev_cpu; + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + goto prev_cpu; } + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + #ifdef CONFIG_SMP return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#else +#endif + +prev_cpu: *is_idle = false; return prev_cpu; -#endif } __bpf_kfunc_end_defs(); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 246019519231..b1675bb59fc4 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq); void scx_rq_activate(struct rq *rq); void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); -bool task_should_scx(struct task_struct *p); +bool task_should_scx(int policy); void init_sched_ext_class(void); static inline u32 scx_cpuperf_target(s32 cpu) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 225b31aaee55..2d16c8545c71 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1247,7 +1247,7 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (rq->nr_running == 1) + if (cfs_rq->nr_running == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { @@ -3369,7 +3369,7 @@ retry_pids: vma = vma_next(&vmi); } - do { + for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); @@ -3491,7 +3491,7 @@ retry_pids: */ if (vma_pids_forced) break; - } for_each_vma(vmi, vma); + } /* * If no VMAs are remaining and VMAs were skipped due to the PID @@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) struct sched_entity *se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - SCHED_WARN_ON(se->sched_delayed); - SCHED_WARN_ON(se->on_rq); + /* + * Must not reference @se again, see __block_task(). + */ return NULL; } return se; @@ -6058,10 +6059,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (se->on_rq) { - SCHED_WARN_ON(se->sched_delayed); + /* Handle any unfinished DELAY_DEQUEUE business first. */ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; - } enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) @@ -7173,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); - /* Fix-up what block_task() skipped. */ + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ __block_task(rq, p); } @@ -7190,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) util_est_dequeue(&rq->cfs, p); - if (dequeue_entities(rq, &p->se, flags) < 0) { - util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) return false; - } - util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + */ + hrtick_update(rq); return true; } @@ -13174,22 +13184,6 @@ static void attach_task_cfs_rq(struct task_struct *p) static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); - /* - * Since this is called after changing class, this is a little weird - * and we cannot use DEQUEUE_DELAYED. - */ - if (p->se.sched_delayed) { - /* First, dequeue it from its new class' structures */ - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); - /* - * Now, clean up the fair_sched_class side of things - * related to sched_delayed being true and that wasn't done - * due to the generic dequeue not using DEQUEUE_DELAYED. - */ - finish_delayed_dequeue_entity(&p->se); - p->se.rel_deadline = 0; - __block_task(rq, p); - } } static void switched_to_fair(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6085ef50febf..6c54a57275cc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2769,8 +2769,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) static inline void __block_task(struct rq *rq, struct task_struct *p) { - WRITE_ONCE(p->on_rq, 0); - ASSERT_EXCLUSIVE_WRITER(p->on_rq); if (p->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -2778,6 +2776,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p) atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); } extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -3800,7 +3830,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); -extern void __setscheduler_prio(struct task_struct *p, int prio); +extern const struct sched_class *__setscheduler_class(int policy, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 237780aa3c53..767e098a3bd1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, - * where a task's runnable state changes, and requeues, where a task - * and its state are being moved between CPUs and runqueues. + * where a task's runnable state changes, and migrations, where a task + * and its runnable state are being moved between CPUs and runqueues. + * + * A notable case is a task whose dequeue is delayed. PSI considers + * those sleeping, but because they are still on the runqueue they can + * go through migration requeues. In this case, *sleeping* states need + * to be transferred. */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) +static inline void psi_enqueue(struct task_struct *p, bool migrate) { - int clear = 0, set = TSK_RUNNING; + int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; - if (p->in_memstall) - set |= TSK_MEMSTALL_RUNNING; - - if (!wakeup) { + if (p->se.sched_delayed) { + /* CPU migration of "sleeping" task */ + SCHED_WARN_ON(!migrate); if (p->in_memstall) set |= TSK_MEMSTALL; + if (p->in_iowait) + set |= TSK_IOWAIT; + } else if (migrate) { + /* CPU migration of runnable task */ + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; } else { + /* Wakeup of new or sleeping task */ if (p->in_iowait) clear |= TSK_IOWAIT; + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; } psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool sleep) +static inline void psi_dequeue(struct task_struct *p, bool migrate) { if (static_branch_likely(&psi_disabled)) return; /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. + */ + if (migrate) + psi_task_change(p, p->psi_flags, 0); + + /* * A voluntary sleep is a dequeue followed by a task switch. To * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ - if (sleep) - return; - - psi_task_change(p, p->psi_flags, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) @@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev, } #else /* CONFIG_PSI */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} -static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_enqueue(struct task_struct *p, bool migrate) {} +static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index aa70beee9895..24f9f90b6574 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p, { int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; int reset_on_fork; @@ -706,6 +706,12 @@ change: queue_flags &= ~DEQUEUE_MOVE; } + prev_class = p->sched_class; + next_class = __setscheduler_class(policy, newprio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -713,11 +719,10 @@ change: if (running) put_prev_task(rq, p); - prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); + p->sched_class = next_class; + p->prio = newprio; } __setscheduler_uclamp(p, attr); check_class_changing(rq, p, prev_class); |