aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c170
1 files changed, 120 insertions, 50 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2cae4620c7..5c883fe8e440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
- if (!cpu_active(dest_cpu))
+ if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+ continue;
+ if (!cpu_online(dest_cpu))
continue;
goto out;
}
@@ -1935,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_acquire(!X->on_cpu)
+ * 2) smp_cond_load_acquire(!X->on_cpu)
*
* Example:
*
@@ -1946,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
- * smp_cond_acquire(!X->on_cpu);
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
* X->state = WAKING
* set_task_cpu(X,2)
*
@@ -1972,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* This means that any means of doing remote wakeups must order the CPU doing
* the wakeup against the CPU the task is going to end up running on. This,
* however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
*
*/
@@ -2045,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_cond_acquire(!p->on_cpu);
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2253,9 +2255,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif
#endif
+#ifdef CONFIG_SCHEDSTATS
+
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
-#ifdef CONFIG_SCHEDSTATS
static void set_schedstats(bool enabled)
{
if (enabled)
@@ -2278,11 +2282,16 @@ static int __init setup_schedstats(char *str)
if (!str)
goto out;
+ /*
+ * This code is called before jump labels have been set up, so we can't
+ * change the static branch directly just yet. Instead set a temporary
+ * variable so init_schedstats() can do it later.
+ */
if (!strcmp(str, "enable")) {
- set_schedstats(true);
+ __sched_schedstats = true;
ret = 1;
} else if (!strcmp(str, "disable")) {
- set_schedstats(false);
+ __sched_schedstats = false;
ret = 1;
}
out:
@@ -2293,6 +2302,11 @@ out:
}
__setup("schedstats=", setup_schedstats);
+static void __init init_schedstats(void)
+{
+ set_schedstats(__sched_schedstats);
+}
+
#ifdef CONFIG_PROC_SYSCTL
int sysctl_schedstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2313,8 +2327,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,
set_schedstats(state);
return err;
}
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#else /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
/*
* fork()/clone()-time setup:
@@ -2326,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2367,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2378,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2510,21 +2531,22 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf;
struct rq *rq;
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
- /* Post initialize new task's util average when its cfs_rq is set */
+ rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
- rq = __task_rq_lock(p, &rf);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -3146,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -3156,7 +3181,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
- BUG_ON(task_stack_end_corrupted(prev));
+ if (task_stack_end_corrupted(prev))
+ panic("corrupted stack end detected inside scheduler\n");
#endif
if (unlikely(in_atomic_preempt_off())) {
@@ -4736,7 +4762,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -5133,14 +5160,16 @@ void show_state_filter(unsigned long state_filter)
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
*/
touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
}
- touch_all_softlockup_watchdogs();
-
#ifdef CONFIG_SCHED_DEBUG
if (!state_filter)
sysrq_sched_debug_show();
@@ -5376,13 +5405,15 @@ void idle_task_exit(void)
/*
* Since this CPU is going 'away' for a while, fold any nr_active delta
* we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable.
+ * nr_active count is stable. We need to take the teardown thread which
+ * is calling this into account, so we hand in adjust = 1 to the load
+ * calculation.
*
* Also see the comment "Global load-average calculations".
*/
static void calc_load_migrate(struct rq *rq)
{
- long delta = calc_load_fold_active(rq);
+ long delta = calc_load_fold_active(rq, 1);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
}
@@ -7213,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
update_max_interval();
}
@@ -7487,6 +7517,8 @@ void __init sched_init(void)
#endif
init_sched_fair_class();
+ init_schedstats();
+
scheduler_running = 1;
}
@@ -7691,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
}
/* rcu callback to free various structures associated with a task group */
@@ -7719,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &rf);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7752,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -8184,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -8203,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if its
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)