aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/ext.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--kernel/sched/ext.c71
1 files changed, 47 insertions, 24 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3c4a94e4258f..7fff1d045477 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2751,7 +2751,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+ rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
unlikely(rq->scx.cpu_released)) {
@@ -2762,7 +2762,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* emitted in switch_class().
*/
if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
+ SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -3065,12 +3065,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
{
struct task_struct *prev = rq->curr;
struct task_struct *p;
+ bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ bool kick_idle = false;
/*
- * If balance_scx() is telling us to keep running @prev, replenish slice
- * if necessary and keep running @prev. Otherwise, pop the first one
- * from the local DSQ.
- *
* WORKAROUND:
*
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -3079,22 +3078,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* which then ends up calling pick_task_scx() without preceding
* balance_scx().
*
- * For now, ignore cases where $prev is not on SCX. This isn't great and
- * can theoretically lead to stalls. However, for switch_all cases, this
- * happens only while a BPF scheduler is being loaded or unloaded, and,
- * for partial cases, fair will likely keep triggering this CPU.
+ * Keep running @prev if possible and avoid stalling from entering idle
+ * without balancing.
*
- * Once fair is fixed, restore WARN_ON_ONCE().
+ * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
+ * if pick_task_scx() is called without preceding balance_scx().
+ */
+ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+ if (prev_on_scx) {
+ keep_prev = true;
+ } else {
+ keep_prev = false;
+ kick_idle = true;
+ }
+ } else if (unlikely(keep_prev && !prev_on_scx)) {
+ /* only allowed during transitions */
+ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ keep_prev = false;
+ }
+
+ /*
+ * If balance_scx() is telling us to keep running @prev, replenish slice
+ * if necessary and keep running @prev. Otherwise, pop the first one
+ * from the local DSQ.
*/
- if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
- prev->sched_class == &ext_sched_class) {
+ if (keep_prev) {
p = prev;
if (!p->scx.slice)
p->scx.slice = SCX_SLICE_DFL;
} else {
p = first_local_task(rq);
- if (!p)
+ if (!p) {
+ if (kick_idle)
+ scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
return NULL;
+ }
if (unlikely(!p->scx.slice)) {
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
@@ -3908,12 +3926,7 @@ static void scx_ops_exit_task(struct task_struct *p)
void init_scx_entity(struct sched_ext_entity *scx)
{
- /*
- * init_idle() calls this function again after fork sequence is
- * complete. Don't touch ->tasks_node as it's already linked.
- */
- memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node));
-
+ memset(scx, 0, sizeof(*scx));
INIT_LIST_HEAD(&scx->dsq_list.node);
RB_CLEAR_NODE(&scx->dsq_priq);
scx->sticky_cpu = -1;
@@ -4616,14 +4629,14 @@ static const struct kset_uevent_ops scx_uevent_ops = {
* Used by sched_fork() and __setscheduler_prio() to pick the matching
* sched_class. dl/rt are already handled.
*/
-bool task_should_scx(struct task_struct *p)
+bool task_should_scx(int policy)
{
if (!scx_enabled() ||
unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
return false;
if (READ_ONCE(scx_switching_all))
return true;
- return p->policy == SCHED_EXT;
+ return policy == SCHED_EXT;
}
/**
@@ -4902,11 +4915,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
const struct sched_class *old_class = p->sched_class;
+ const struct sched_class *new_class =
+ __setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- __setscheduler_prio(p, p->prio);
+ p->sched_class = new_class;
check_class_changing(task_rq(p), p, old_class);
sched_enq_and_set_task(&ctx);
@@ -5615,12 +5633,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
const struct sched_class *old_class = p->sched_class;
+ const struct sched_class *new_class =
+ __setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
p->scx.slice = SCX_SLICE_DFL;
- __setscheduler_prio(p, p->prio);
+ p->sched_class = new_class;
check_class_changing(task_rq(p), p, old_class);
sched_enq_and_set_task(&ctx);