diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 668 |
1 files changed, 389 insertions, 279 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c86935a7f1f8..8b864ecee0e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1000,7 +1000,11 @@ inline int task_curr(const struct task_struct *p) } /* - * Can drop rq->lock because from sched_class::switched_from() methods drop it. + * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, + * use the balance_callback list if you want balancing. + * + * this means any call to check_class_changed() must be followed by a call to + * balance_callback(). */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, @@ -1009,7 +1013,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); - /* Possble rq->lock 'hole'. */ + p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1041,6 +1045,222 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. + */ + +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. + */ +static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +{ + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + +struct migration_arg { + struct task_struct *task; + int dest_cpu; +}; + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) +{ + if (unlikely(!cpu_active(dest_cpu))) + return rq; + + /* Affinity changed (again). */ + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return rq; + + rq = move_queued_task(rq, p, dest_cpu); + + return rq; +} + +/* + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. + */ +static int migration_cpu_stop(void *data) +{ + struct migration_arg *arg = data; + struct task_struct *p = arg->task; + struct rq *rq = this_rq(); + + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ + local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); + + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because + * we're holding p->pi_lock. + */ + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + raw_spin_unlock(&rq->lock); + raw_spin_unlock(&p->pi_lock); + + local_irq_enable(); + return 0; +} + +/* + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. + */ +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +{ + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); +} + +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + dequeue_task(rq, p, 0); + } + if (running) + put_prev_task(rq, p); + + p->sched_class->set_cpus_allowed(p, new_mask); + + if (running) + p->sched_class->set_curr_task(rq); + if (queued) + enqueue_task(rq, p, 0); +} + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + unsigned long flags; + struct rq *rq; + unsigned int dest_cpu; + int ret = 0; + + rq = task_rq_lock(p, &flags); + + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; + + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; + goto out; + } + + do_set_cpus_allowed(p, new_mask); + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (task_running(rq, p) || p->state == TASK_WAKING) { + struct migration_arg arg = { p, dest_cpu }; + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, p, &flags); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + return 0; + } else if (task_on_rq_queued(p)) { + /* + * OK, since we're going to drop the lock immediately + * afterwards anyway. + */ + lockdep_unpin_lock(&rq->lock); + rq = move_queued_task(rq, p, dest_cpu); + lockdep_pin_lock(&rq->lock); + } +out: + task_rq_unlock(rq, p, &flags); + + return ret; +} + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -1181,13 +1401,6 @@ out: return ret; } -struct migration_arg { - struct task_struct *task; - int dest_cpu; -}; - -static int migration_cpu_stop(void *data); - /* * wait_task_inactive - wait for a thread to unschedule. * @@ -1320,9 +1533,7 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); -#endif /* CONFIG_SMP */ -#ifdef CONFIG_SMP /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ @@ -1402,6 +1613,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { + lockdep_assert_held(&p->pi_lock); + if (p->nr_cpus_allowed > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); @@ -1427,7 +1640,16 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } -#endif + +#else + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} + +#endif /* CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -1486,12 +1708,19 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); - trace_sched_wakeup(p, true); - p->state = TASK_RUNNING; + trace_sched_wakeup(p); + #ifdef CONFIG_SMP - if (p->sched_class->task_woken) + if (p->sched_class->task_woken) { + /* + * Our task @p is fully woken up and running; so its safe to + * drop the rq->lock, hereafter rq is only used for statistics. + */ + lockdep_unpin_lock(&rq->lock); p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); + } if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; @@ -1510,6 +1739,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) { + lockdep_assert_held(&rq->lock); + #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; @@ -1554,6 +1785,7 @@ void sched_ttwu_pending(void) return; raw_spin_lock_irqsave(&rq->lock, flags); + lockdep_pin_lock(&rq->lock); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1561,6 +1793,7 @@ void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } + lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1657,7 +1890,9 @@ static void ttwu_queue(struct task_struct *p, int cpu) #endif raw_spin_lock(&rq->lock); + lockdep_pin_lock(&rq->lock); ttwu_do_activate(rq, p, 0); + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); } @@ -1693,6 +1928,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (!(p->state & state)) goto out; + trace_sched_waking(p); + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1752,14 +1989,24 @@ static void try_to_wake_up_local(struct task_struct *p) lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task. + */ + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); + lockdep_pin_lock(&rq->lock); } if (!(p->state & TASK_NORMAL)) goto out; + trace_sched_waking(p); + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); @@ -1827,9 +2074,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; -#ifdef CONFIG_SMP - p->se.avg.decay_count = 0; -#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -1975,7 +2219,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) set_task_cpu(p, cpu); raw_spin_unlock_irqrestore(&p->pi_lock, flags); -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif @@ -2011,8 +2255,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -2021,8 +2265,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; @@ -2114,11 +2358,11 @@ void wake_up_new_task(struct task_struct *p) #endif /* Initialize new task's runnable average */ - init_task_runnable_average(p); + init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; - trace_sched_wakeup_new(p, true); + trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) @@ -2131,13 +2375,27 @@ void wake_up_new_task(struct task_struct *p) static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; +void preempt_notifier_inc(void) +{ + static_key_slow_inc(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_inc); + +void preempt_notifier_dec(void) +{ + static_key_slow_dec(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_dec); + /** * preempt_notifier_register - tell me when current is being preempted & rescheduled * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) { - static_key_slow_inc(&preempt_notifier_key); + if (!static_key_false(&preempt_notifier_key)) + WARN(1, "registering preempt_notifier while notifiers disabled\n"); + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); } EXPORT_SYMBOL_GPL(preempt_notifier_register); @@ -2151,7 +2409,6 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register); void preempt_notifier_unregister(struct preempt_notifier *notifier) { hlist_del(¬ifier->link); - static_key_slow_dec(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_unregister); @@ -2267,7 +2524,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) */ prev_state = prev->state; vtime_task_switch(prev); - finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -2287,30 +2543,42 @@ static struct rq *finish_task_switch(struct task_struct *prev) put_task_struct(prev); } - tick_nohz_task_switch(current); + tick_nohz_task_switch(); return rq; } #ifdef CONFIG_SMP /* rq->lock is NOT held, but preemption is disabled */ -static inline void post_schedule(struct rq *rq) +static void __balance_callback(struct rq *rq) { - if (rq->post_schedule) { - unsigned long flags; + struct callback_head *head, *next; + void (*func)(struct rq *rq); + unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->curr->sched_class->post_schedule) - rq->curr->sched_class->post_schedule(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); + head = rq->balance_callback; + rq->balance_callback = NULL; + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; - rq->post_schedule = 0; + func(rq); } + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +static inline void balance_callback(struct rq *rq) +{ + if (unlikely(rq->balance_callback)) + __balance_callback(rq); } #else -static inline void post_schedule(struct rq *rq) +static inline void balance_callback(struct rq *rq) { } @@ -2328,7 +2596,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* finish_task_switch() drops rq->lock and enables preemtion */ preempt_disable(); rq = finish_task_switch(prev); - post_schedule(rq); + balance_callback(rq); preempt_enable(); if (current->set_child_tid) @@ -2372,6 +2640,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ + lockdep_unpin_lock(&rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2794,6 +3063,7 @@ static void __sched __schedule(void) */ smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -2836,10 +3106,12 @@ static void __sched __schedule(void) rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); - } else + } else { + lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); + } - post_schedule(rq); + balance_callback(rq); } static inline void sched_submit_work(struct task_struct *tsk) @@ -3103,7 +3375,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: + preempt_disable(); /* avoid rq from going away on us */ __task_rq_unlock(rq); + + balance_callback(rq); + preempt_enable(); } #endif @@ -3441,7 +3717,7 @@ static bool dl_param_changed(struct task_struct *p, static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, - bool user) + bool user, bool pi) { int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; @@ -3627,18 +3903,20 @@ change: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; - /* - * Take priority boosted tasks into account. If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; + if (pi) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + if (new_effective_prio == oldprio) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } } queued = task_on_rq_queued(p); @@ -3649,7 +3927,7 @@ change: put_prev_task(rq, p); prev_class = p->sched_class; - __setscheduler(rq, p, attr, true); + __setscheduler(rq, p, attr, pi); if (running) p->sched_class->set_curr_task(rq); @@ -3662,9 +3940,17 @@ change: } check_class_changed(rq, p, prev_class, oldprio); + preempt_disable(); /* avoid rq from going away on us */ task_rq_unlock(rq, p, &flags); - rt_mutex_adjust_pi(p); + if (pi) + rt_mutex_adjust_pi(p); + + /* + * Run balance callbacks after we've adjusted the PI chain. + */ + balance_callback(rq); + preempt_enable(); return 0; } @@ -3685,7 +3971,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy, attr.sched_policy = policy; } - return __sched_setscheduler(p, &attr, check); + return __sched_setscheduler(p, &attr, check, true); } /** * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. @@ -3706,7 +3992,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { - return __sched_setscheduler(p, attr, true); + return __sched_setscheduler(p, attr, true, true); } EXPORT_SYMBOL_GPL(sched_setattr); @@ -4108,7 +4394,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = set_cpus_allowed_ptr(p, new_mask); + retval = __set_cpus_allowed_ptr(p, new_mask, true); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -4260,7 +4546,7 @@ SYSCALL_DEFINE0(sched_yield) int __sched _cond_resched(void) { - if (should_resched()) { + if (should_resched(0)) { preempt_schedule_common(); return 1; } @@ -4278,7 +4564,7 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { - int resched = should_resched(); + int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; lockdep_assert_held(lock); @@ -4300,7 +4586,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (should_resched()) { + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { local_bh_enable(); preempt_schedule_common(); local_bh_disable(); @@ -4633,7 +4919,8 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); __sched_fork(0, idle); idle->state = TASK_RUNNING; @@ -4659,7 +4946,8 @@ void init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -4754,149 +5042,6 @@ out: } #ifdef CONFIG_SMP -/* - * move_queued_task - move a queued task to new rq. - * - * Returns (locked) new rq. Old rq's lock is released. - */ -static struct rq *move_queued_task(struct task_struct *p, int new_cpu) -{ - struct rq *rq = task_rq(p); - - lockdep_assert_held(&rq->lock); - - dequeue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); - - rq = cpu_rq(new_cpu); - - raw_spin_lock(&rq->lock); - BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; - enqueue_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - - return rq; -} - -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - - cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); -} - -/* - * This is how migration works: - * - * 1) we invoke migration_cpu_stop() on the target CPU using - * stop_one_cpu(). - * 2) stopper starts to run (implicitly forcing the migrated thread - * off the CPU) - * 3) it checks whether the migrated task is still in the wrong runqueue. - * 4) if it's in the wrong runqueue then the migration thread removes - * it and puts it into the right queue. - * 5) stopper completes and stop_one_cpu() returns and the migration - * is done. - */ - -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. - */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - unsigned long flags; - struct rq *rq; - unsigned int dest_cpu; - int ret = 0; - - rq = task_rq_lock(p, &flags); - - if (cpumask_equal(&p->cpus_allowed, new_mask)) - goto out; - - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; - } - - do_set_cpus_allowed(p, new_mask); - - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; - - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - return 0; - } else if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -out: - task_rq_unlock(rq, p, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -/* - * Move (not current) task off this cpu, onto dest cpu. We're doing - * this because either it can't run here any more (set_cpus_allowed() - * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_exec). - * - * So we race with normal scheduler movements, but that's OK, as long - * as the task is no longer on this CPU. - * - * Returns non-zero if task was successfully migrated. - */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) -{ - struct rq *rq; - int ret = 0; - - if (unlikely(!cpu_active(dest_cpu))) - return ret; - - rq = cpu_rq(src_cpu); - - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - /* Already moved. */ - if (task_cpu(p) != src_cpu) - goto done; - - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - goto fail; - - /* - * If we're not on a rq, the next wake-up will ensure we're - * placed properly. - */ - if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -done: - ret = 1; -fail: - raw_spin_unlock(&rq->lock); - raw_spin_unlock(&p->pi_lock); - return ret; -} #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ @@ -4944,35 +5089,9 @@ void sched_setnuma(struct task_struct *p, int nid) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } -#endif - -/* - * migration_cpu_stop - this will be executed by a highprio stopper thread - * and performs thread migration by bumping thread off CPU then - * 'pushing' onto another runqueue. - */ -static int migration_cpu_stop(void *data) -{ - struct migration_arg *arg = data; - - /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. - */ - local_irq_disable(); - /* - * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed - * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. - */ - sched_ttwu_pending(); - __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); - local_irq_enable(); - return 0; -} +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_HOTPLUG_CPU - /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -5028,9 +5147,9 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(unsigned int dead_cpu) +static void migrate_tasks(struct rq *dead_rq) { - struct rq *rq = cpu_rq(dead_cpu); + struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; @@ -5052,7 +5171,7 @@ static void migrate_tasks(unsigned int dead_cpu) */ update_rq_clock(rq); - for ( ; ; ) { + for (;;) { /* * There's this thread running, bail when that's the only * remaining thread. @@ -5060,22 +5179,29 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; + /* + * Ensure rq->lock covers the entire task selection + * until the migration. + */ + lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_cpu, next); - raw_spin_unlock(&rq->lock); - - __migrate_task(next, dead_cpu, dest_cpu); - - raw_spin_lock(&rq->lock); + dest_cpu = select_fallback_rq(dead_rq->cpu, next); + + lockdep_unpin_lock(&rq->lock); + rq = __migrate_task(rq, next, dest_cpu); + if (rq != dead_rq) { + raw_spin_unlock(&rq->lock); + rq = dead_rq; + raw_spin_lock(&rq->lock); + } } rq->stop = stop; } - #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5241,8 +5367,7 @@ static void register_sched_domain_sysctl(void) /* may be called multiple times per register */ static void unregister_sched_domain_sysctl(void) { - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); + unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; if (sd_ctl_dir[0].child) sd_free_ctl_entry(&sd_ctl_dir[0].child); @@ -5254,7 +5379,7 @@ static void register_sched_domain_sysctl(void) static void unregister_sched_domain_sysctl(void) { } -#endif +#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ static void set_rq_online(struct rq *rq) { @@ -5323,7 +5448,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(cpu); + migrate_tasks(rq); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; @@ -5363,6 +5488,14 @@ static int sched_cpu_active(struct notifier_block *nfb, case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: + /* + * At this point a starting CPU has marked itself as online via + * set_cpu_online(). But it might not yet have marked itself + * as active, which is essential from here on. + * + * Thus, fall-through and help the starting CPU along. + */ case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -5401,9 +5534,6 @@ static int __init migration_init(void) return 0; } early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ @@ -6378,8 +6508,10 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (n <= 1) + if (sched_domains_numa_levels <= 1) { sched_numa_topology_type = NUMA_DIRECT; + return; + } for_each_online_node(a) { for_each_online_node(b) { @@ -6629,7 +6761,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) struct sched_group *sg; struct sched_group_capacity *sgc; - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sd) return -ENOMEM; @@ -7235,7 +7367,7 @@ void __init sched_init(void) rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->post_schedule = 0; + rq->balance_callback = NULL; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -7365,32 +7497,12 @@ EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ -static void normalize_task(struct rq *rq, struct task_struct *p) +void normalize_rt_tasks(void) { - const struct sched_class *prev_class = p->sched_class; + struct task_struct *g, *p; struct sched_attr attr = { .sched_policy = SCHED_NORMAL, }; - int old_prio = p->prio; - int queued; - - queued = task_on_rq_queued(p); - if (queued) - dequeue_task(rq, p, 0); - __setscheduler(rq, p, &attr, false); - if (queued) { - enqueue_task(rq, p, 0); - resched_curr(rq); - } - - check_class_changed(rq, p, prev_class, old_prio); -} - -void normalize_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; read_lock(&tasklist_lock); for_each_process_thread(g, p) { @@ -7417,9 +7529,7 @@ void normalize_rt_tasks(void) continue; } - rq = task_rq_lock(p, &flags); - normalize_task(rq, p); - task_rq_unlock(rq, p, &flags); + __sched_setscheduler(p, &attr, false, false); } read_unlock(&tasklist_lock); } |