Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  796
1 file changed, 451 insertions, 345 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8465eeab8b3..404c0784b1fc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -33,7 +33,7 @@  #include <linux/init.h>  #include <linux/uaccess.h>  #include <linux/highmem.h> -#include <asm/mmu_context.h> +#include <linux/mmu_context.h>  #include <linux/interrupt.h>  #include <linux/capability.h>  #include <linux/completion.h> @@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void)  	return rq;  } +/* + * __task_rq_lock - lock the rq @p resides on. + */ +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) +	__acquires(rq->lock) +{ +	struct rq *rq; + +	lockdep_assert_held(&p->pi_lock); + +	for (;;) { +		rq = task_rq(p); +		raw_spin_lock(&rq->lock); +		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { +			rf->cookie = lockdep_pin_lock(&rq->lock); +			return rq; +		} +		raw_spin_unlock(&rq->lock); + +		while (unlikely(task_on_rq_migrating(p))) +			cpu_relax(); +	} +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) +	__acquires(p->pi_lock) +	__acquires(rq->lock) +{ +	struct rq *rq; + +	for (;;) { +		raw_spin_lock_irqsave(&p->pi_lock, rf->flags); +		rq = task_rq(p); +		raw_spin_lock(&rq->lock); +		/* +		 *	move_queued_task()		task_rq_lock() +		 * +		 *	ACQUIRE (rq->lock) +		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq() +		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock); +		 *	[S] ->cpu = new_cpu		[L] task_rq() +		 *					[L] ->on_rq +		 *	RELEASE (rq->lock) +		 * +		 * If we observe the old cpu in task_rq_lock, the acquire of +		 * the old rq->lock will fully serialize against the stores. +		 * +		 * If we observe the new cpu in task_rq_lock, the acquire will +		 * pair with the WMB to ensure we must then also see migrating. +		 */ +		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { +			rf->cookie = lockdep_pin_lock(&rq->lock); +			return rq; +		} +		raw_spin_unlock(&rq->lock); +		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); + +		while (unlikely(task_on_rq_migrating(p))) +			cpu_relax(); +	} +} +  #ifdef CONFIG_SCHED_HRTICK  /*   * Use HR-timers to deliver accurate preemption points. @@ -249,29 +314,6 @@ void hrtick_start(struct rq *rq, u64 delay)  	}  } -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ -	int cpu = (int)(long)hcpu; - -	switch (action) { -	case CPU_UP_CANCELED: -	case CPU_UP_CANCELED_FROZEN: -	case CPU_DOWN_PREPARE: -	case CPU_DOWN_PREPARE_FROZEN: -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -		hrtick_clear(cpu_rq(cpu)); -		return NOTIFY_OK; -	} - -	return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ -	hotcpu_notifier(hotplug_hrtick, 0); -}  #else  /*   * Called to set the hrtick timer state. 
@@ -288,10 +330,6 @@ void hrtick_start(struct rq *rq, u64 delay)  	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),  		      HRTIMER_MODE_REL_PINNED);  } - -static inline void init_hrtick(void) -{ -}  #endif /* CONFIG_SMP */  static void init_rq_hrtick(struct rq *rq) @@ -315,12 +353,26 @@ static inline void hrtick_clear(struct rq *rq)  static inline void init_rq_hrtick(struct rq *rq)  {  } - -static inline void init_hrtick(void) -{ -}  #endif	/* CONFIG_SCHED_HRTICK */ +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, mask)						\ +	({								\ +		typeof(ptr) _ptr = (ptr);				\ +		typeof(mask) _mask = (mask);				\ +		typeof(*_ptr) _old, _val = *_ptr;			\ +									\ +		for (;;) {						\ +			_old = cmpxchg(_ptr, _val, _val | _mask);	\ +			if (_old == _val)				\ +				break;					\ +			_val = _old;					\ +		}							\ +	_old;								\ +}) +  #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)  /*   * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, @@ -382,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)  	 * wakeup due to that.  	 *  	 * This cmpxchg() implies a full barrier, which pairs with the write -	 * barrier implied by the wakeup in wake_up_list(). +	 * barrier implied by the wakeup in wake_up_q().  	 */  	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))  		return; @@ -481,7 +533,10 @@ int get_nohz_timer_target(void)  	rcu_read_lock();  	for_each_domain(cpu, sd) {  		for_each_cpu(i, sched_domain_span(sd)) { -			if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { +			if (cpu == i) +				continue; + +			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {  				cpu = i;  				goto unlock;  			} @@ -578,17 +633,8 @@ bool sched_can_stop_tick(struct rq *rq)  		return false;  	/* -	 * FIFO realtime policy runs the highest priority task (after DEADLINE). -	 * Other runnable tasks are of a lower priority. The scheduler tick -	 * isn't needed. -	 */ -	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; -	if (fifo_nr_running) -		return true; - -	/* -	 * Round-robin realtime tasks time slice with other tasks at the same -	 * realtime priority. +	 * If there are more than one RR tasks, we need the tick to effect the +	 * actual RR behaviour.  	 */  	if (rq->rt.rr_nr_running) {  		if (rq->rt.rr_nr_running == 1) @@ -597,8 +643,20 @@ bool sched_can_stop_tick(struct rq *rq)  			return false;  	} -	/* Normal multitasking need periodic preemption checks */ -	if (rq->cfs.nr_running > 1) +	/* +	 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no +	 * forced preemption between FIFO tasks. +	 */ +	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; +	if (fifo_nr_running) +		return true; + +	/* +	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; +	 * if there's more than one we need the tick for involuntary +	 * preemption. 
+	 */ +	if (rq->nr_running > 1)  		return false;  	return true; @@ -1064,12 +1122,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  static int __set_cpus_allowed_ptr(struct task_struct *p,  				  const struct cpumask *new_mask, bool check)  { -	unsigned long flags; -	struct rq *rq; +	const struct cpumask *cpu_valid_mask = cpu_active_mask;  	unsigned int dest_cpu; +	struct rq_flags rf; +	struct rq *rq;  	int ret = 0; -	rq = task_rq_lock(p, &flags); +	rq = task_rq_lock(p, &rf); + +	if (p->flags & PF_KTHREAD) { +		/* +		 * Kernel threads are allowed on online && !active CPUs +		 */ +		cpu_valid_mask = cpu_online_mask; +	}  	/*  	 * Must re-check here, to close a race against __kthread_bind(), @@ -1083,22 +1149,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  	if (cpumask_equal(&p->cpus_allowed, new_mask))  		goto out; -	if (!cpumask_intersects(new_mask, cpu_active_mask)) { +	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {  		ret = -EINVAL;  		goto out;  	}  	do_set_cpus_allowed(p, new_mask); +	if (p->flags & PF_KTHREAD) { +		/* +		 * For kernel threads that do indeed end up on online && +		 * !active we want to ensure they are strict per-cpu threads. +		 */ +		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && +			!cpumask_intersects(new_mask, cpu_active_mask) && +			p->nr_cpus_allowed != 1); +	} +  	/* Can the task run on the task's current CPU? If so, we're done */  	if (cpumask_test_cpu(task_cpu(p), new_mask))  		goto out; -	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); +	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);  	if (task_running(rq, p) || p->state == TASK_WAKING) {  		struct migration_arg arg = { p, dest_cpu };  		/* Need help from migration thread: drop lock and wait. */ -		task_rq_unlock(rq, p, &flags); +		task_rq_unlock(rq, p, &rf);  		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);  		tlb_migrate_finish(p->mm);  		return 0; @@ -1107,12 +1183,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  		 * OK, since we're going to drop the lock immediately  		 * afterwards anyway.  		 */ -		lockdep_unpin_lock(&rq->lock); +		lockdep_unpin_lock(&rq->lock, rf.cookie);  		rq = move_queued_task(rq, p, dest_cpu); -		lockdep_pin_lock(&rq->lock); +		lockdep_repin_lock(&rq->lock, rf.cookie);  	}  out: -	task_rq_unlock(rq, p, &flags); +	task_rq_unlock(rq, p, &rf);  	return ret;  } @@ -1296,8 +1372,8 @@ out:   */  unsigned long wait_task_inactive(struct task_struct *p, long match_state)  { -	unsigned long flags;  	int running, queued; +	struct rq_flags rf;  	unsigned long ncsw;  	struct rq *rq; @@ -1332,14 +1408,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)  		 * lock now, to be *sure*. If we're wrong, we'll  		 * just go back and repeat.  		 */ -		rq = task_rq_lock(p, &flags); +		rq = task_rq_lock(p, &rf);  		trace_sched_wait_task(p);  		running = task_running(rq, p);  		queued = task_on_rq_queued(p);  		ncsw = 0;  		if (!match_state || p->state == match_state)  			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -		task_rq_unlock(rq, p, &flags); +		task_rq_unlock(rq, p, &rf);  		/*  		 * If it changed from the expected state, bail out now. @@ -1413,6 +1489,25 @@ EXPORT_SYMBOL_GPL(kick_process);  /*   * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * + *  - cpu_active must be a subset of cpu_online + * + *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + *    see __set_cpus_allowed_ptr(). 
At this point the newly online + *    cpu isn't yet part of the sched domains, and balancing will not + *    see it. + * + *  - on cpu-down we clear cpu_active() to mask the sched domains and + *    avoid the load balancer to place new tasks on the to be removed + *    cpu. Existing tasks will remain running there and will be taken + *    off. + * + * This means that fallback selection must not select !active CPUs. + * And can assume that any active CPU must be online. Conversely + * select_task_rq() below may allow selection of !active CPUs in order + * to satisfy the above rules.   */  static int select_fallback_rq(int cpu, struct task_struct *p)  { @@ -1431,8 +1526,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  		/* Look for allowed, online CPU in same node. */  		for_each_cpu(dest_cpu, nodemask) { -			if (!cpu_online(dest_cpu)) -				continue;  			if (!cpu_active(dest_cpu))  				continue;  			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) @@ -1443,8 +1536,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  	for (;;) {  		/* Any allowed, online CPU? */  		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { -			if (!cpu_online(dest_cpu)) -				continue;  			if (!cpu_active(dest_cpu))  				continue;  			goto out; @@ -1494,8 +1585,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)  {  	lockdep_assert_held(&p->pi_lock); -	if (p->nr_cpus_allowed > 1) +	if (tsk_nr_cpus_allowed(p) > 1)  		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); +	else +		cpu = cpumask_any(tsk_cpus_allowed(p));  	/*  	 * In order not to call set_task_cpu() on a blocking task we need @@ -1583,8 +1676,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl  /*   * Mark the task runnable and perform wakeup-preemption.   */ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, +			   struct pin_cookie cookie)  {  	check_preempt_curr(rq, p, wake_flags);  	p->state = TASK_RUNNING; @@ -1596,9 +1689,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  		 * Our task @p is fully woken up and running; so its safe to  		 * drop the rq->lock, hereafter rq is only used for statistics.  		 
*/ -		lockdep_unpin_lock(&rq->lock); +		lockdep_unpin_lock(&rq->lock, cookie);  		p->sched_class->task_woken(rq, p); -		lockdep_pin_lock(&rq->lock); +		lockdep_repin_lock(&rq->lock, cookie);  	}  	if (rq->idle_stamp) { @@ -1616,17 +1709,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  }  static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, +		 struct pin_cookie cookie)  { +	int en_flags = ENQUEUE_WAKEUP; +  	lockdep_assert_held(&rq->lock);  #ifdef CONFIG_SMP  	if (p->sched_contributes_to_load)  		rq->nr_uninterruptible--; + +	if (wake_flags & WF_MIGRATED) +		en_flags |= ENQUEUE_MIGRATED;  #endif -	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); -	ttwu_do_wakeup(rq, p, wake_flags); +	ttwu_activate(rq, p, en_flags); +	ttwu_do_wakeup(rq, p, wake_flags, cookie);  }  /* @@ -1637,17 +1736,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)   */  static int ttwu_remote(struct task_struct *p, int wake_flags)  { +	struct rq_flags rf;  	struct rq *rq;  	int ret = 0; -	rq = __task_rq_lock(p); +	rq = __task_rq_lock(p, &rf);  	if (task_on_rq_queued(p)) {  		/* check_preempt_curr() may use rq clock */  		update_rq_clock(rq); -		ttwu_do_wakeup(rq, p, wake_flags); +		ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);  		ret = 1;  	} -	__task_rq_unlock(rq); +	__task_rq_unlock(rq, &rf);  	return ret;  } @@ -1657,6 +1757,7 @@ void sched_ttwu_pending(void)  {  	struct rq *rq = this_rq();  	struct llist_node *llist = llist_del_all(&rq->wake_list); +	struct pin_cookie cookie;  	struct task_struct *p;  	unsigned long flags; @@ -1664,15 +1765,19 @@ void sched_ttwu_pending(void)  		return;  	raw_spin_lock_irqsave(&rq->lock, flags); -	lockdep_pin_lock(&rq->lock); +	cookie = lockdep_pin_lock(&rq->lock);  	while (llist) {  		p = llist_entry(llist, struct task_struct, wake_entry);  		llist = llist_next(llist); -		ttwu_do_activate(rq, p, 0); +		/* +		 * See ttwu_queue(); we only call ttwu_queue_remote() when +		 * its a x-cpu wakeup. 
+		 */ +		ttwu_do_activate(rq, p, WF_MIGRATED, cookie);  	} -	lockdep_unpin_lock(&rq->lock); +	lockdep_unpin_lock(&rq->lock, cookie);  	raw_spin_unlock_irqrestore(&rq->lock, flags);  } @@ -1756,9 +1861,10 @@ bool cpus_share_cache(int this_cpu, int that_cpu)  }  #endif /* CONFIG_SMP */ -static void ttwu_queue(struct task_struct *p, int cpu) +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)  {  	struct rq *rq = cpu_rq(cpu); +	struct pin_cookie cookie;  #if defined(CONFIG_SMP)  	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { @@ -1769,9 +1875,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)  #endif  	raw_spin_lock(&rq->lock); -	lockdep_pin_lock(&rq->lock); -	ttwu_do_activate(rq, p, 0); -	lockdep_unpin_lock(&rq->lock); +	cookie = lockdep_pin_lock(&rq->lock); +	ttwu_do_activate(rq, p, wake_flags, cookie); +	lockdep_unpin_lock(&rq->lock, cookie);  	raw_spin_unlock(&rq->lock);  } @@ -1940,9 +2046,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	p->sched_contributes_to_load = !!task_contributes_to_load(p);  	p->state = TASK_WAKING; -	if (p->sched_class->task_waking) -		p->sched_class->task_waking(p); -  	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);  	if (task_cpu(p) != cpu) {  		wake_flags |= WF_MIGRATED; @@ -1950,7 +2053,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	}  #endif /* CONFIG_SMP */ -	ttwu_queue(p, cpu); +	ttwu_queue(p, cpu, wake_flags);  stat:  	if (schedstat_enabled())  		ttwu_stat(p, cpu, wake_flags); @@ -1968,7 +2071,7 @@ out:   * ensure that this_rq() is locked, @p is bound to this_rq() and not   * the current task.   */ -static void try_to_wake_up_local(struct task_struct *p) +static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)  {  	struct rq *rq = task_rq(p); @@ -1985,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p)  		 * disabled avoiding further scheduler activity on it and we've  		 * not yet picked a replacement task.  		 */ -		lockdep_unpin_lock(&rq->lock); +		lockdep_unpin_lock(&rq->lock, cookie);  		raw_spin_unlock(&rq->lock);  		raw_spin_lock(&p->pi_lock);  		raw_spin_lock(&rq->lock); -		lockdep_pin_lock(&rq->lock); +		lockdep_repin_lock(&rq->lock, cookie);  	}  	if (!(p->state & TASK_NORMAL)) @@ -2000,7 +2103,7 @@ static void try_to_wake_up_local(struct task_struct *p)  	if (!task_on_rq_queued(p))  		ttwu_activate(rq, p, ENQUEUE_WAKEUP); -	ttwu_do_wakeup(rq, p, 0); +	ttwu_do_wakeup(rq, p, 0, cookie);  	if (schedstat_enabled())  		ttwu_stat(p, smp_processor_id(), 0);  out: @@ -2360,7 +2463,8 @@ static int dl_overflow(struct task_struct *p, int policy,  	u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0;  	int cpus, err = -1; -	if (new_bw == p->dl.dl_bw) +	/* !deadline task may carry old deadline bandwidth */ +	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))  		return 0;  	/* @@ -2399,12 +2503,12 @@ extern void init_dl_bw(struct dl_bw *dl_b);   */  void wake_up_new_task(struct task_struct *p)  { -	unsigned long flags; +	struct rq_flags rf;  	struct rq *rq; -	raw_spin_lock_irqsave(&p->pi_lock, flags);  	/* Initialize new task's runnable average */  	init_entity_runnable_average(&p->se); +	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);  #ifdef CONFIG_SMP  	/*  	 * Fork balancing, do it here and not earlier because: @@ -2413,8 +2517,10 @@ void wake_up_new_task(struct task_struct *p)  	 */  	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));  #endif +	/* Post initialize new task's util average when its cfs_rq is set */ +	post_init_entity_util_avg(&p->se); -	rq = __task_rq_lock(p); +	rq = __task_rq_lock(p, &rf);  	activate_task(rq, p, 0);  	p->on_rq = TASK_ON_RQ_QUEUED;  	trace_sched_wakeup_new(p); @@ -2425,12 +2531,12 @@ void wake_up_new_task(struct task_struct *p)  		 * Nothing relies on rq->lock after this, so its fine to  		 * drop it.  		 */ -		lockdep_unpin_lock(&rq->lock); +		lockdep_unpin_lock(&rq->lock, rf.cookie);  		p->sched_class->task_woken(rq, p); -		lockdep_pin_lock(&rq->lock); +		lockdep_repin_lock(&rq->lock, rf.cookie);  	}  #endif -	task_rq_unlock(rq, p, &flags); +	task_rq_unlock(rq, p, &rf);  }  #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2692,7 +2798,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)   */  static __always_inline struct rq *  context_switch(struct rq *rq, struct task_struct *prev, -	       struct task_struct *next) +	       struct task_struct *next, struct pin_cookie cookie)  {  	struct mm_struct *mm, *oldmm; @@ -2712,7 +2818,7 @@ context_switch(struct rq *rq, struct task_struct *prev,  		atomic_inc(&oldmm->mm_count);  		enter_lazy_tlb(oldmm, next);  	} else -		switch_mm(oldmm, mm, next); +		switch_mm_irqs_off(oldmm, mm, next);  	if (!prev->mm) {  		prev->active_mm = NULL; @@ -2724,7 +2830,7 @@ context_switch(struct rq *rq, struct task_struct *prev,  	 * of the scheduler it's an obvious special-case), so we  	 * do an early lockdep release here:  	 */ -	lockdep_unpin_lock(&rq->lock); +	lockdep_unpin_lock(&rq->lock, cookie);  	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);  	/* Here we just switch the register state and the stack. */ @@ -2846,7 +2952,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);   */  unsigned long long task_sched_runtime(struct task_struct *p)  { -	unsigned long flags; +	struct rq_flags rf;  	struct rq *rq;  	u64 ns; @@ -2866,7 +2972,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)  		return p->se.sum_exec_runtime;  #endif -	rq = task_rq_lock(p, &flags); +	rq = task_rq_lock(p, &rf);  	/*  	 * Must be ->curr _and_ ->on_rq.  
If dequeued, we would  	 * project cycles that may never be accounted to this @@ -2877,7 +2983,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)  		p->sched_class->update_curr(rq);  	}  	ns = p->se.sum_exec_runtime; -	task_rq_unlock(rq, p, &flags); +	task_rq_unlock(rq, p, &rf);  	return ns;  } @@ -2897,7 +3003,7 @@ void scheduler_tick(void)  	raw_spin_lock(&rq->lock);  	update_rq_clock(rq);  	curr->sched_class->task_tick(rq, curr, 0); -	update_cpu_load_active(rq); +	cpu_load_update_active(rq);  	calc_global_load_tick(rq);  	raw_spin_unlock(&rq->lock); @@ -2940,6 +3046,20 @@ u64 scheduler_tick_max_deferment(void)  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \  				defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. + */ +static inline void preempt_latency_start(int val) +{ +	if (preempt_count() == val) { +		unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT +		current->preempt_disable_ip = ip; +#endif +		trace_preempt_off(CALLER_ADDR0, ip); +	} +}  void preempt_count_add(int val)  { @@ -2958,17 +3078,21 @@ void preempt_count_add(int val)  	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=  				PREEMPT_MASK - 10);  #endif -	if (preempt_count() == val) { -		unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT -		current->preempt_disable_ip = ip; -#endif -		trace_preempt_off(CALLER_ADDR0, ip); -	} +	preempt_latency_start(val);  }  EXPORT_SYMBOL(preempt_count_add);  NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in equals to the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ +	if (preempt_count() == val) +		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} +  void preempt_count_sub(int val)  {  #ifdef CONFIG_DEBUG_PREEMPT @@ -2985,13 +3109,15 @@ void preempt_count_sub(int val)  		return;  #endif -	if (preempt_count() == val) -		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +	preempt_latency_stop(val);  	__preempt_count_sub(val);  }  EXPORT_SYMBOL(preempt_count_sub);  NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { }  #endif  /* @@ -3044,7 +3170,7 @@ static inline void schedule_debug(struct task_struct *prev)   * Pick up the highest-prio task:   */  static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)  {  	const struct sched_class *class = &fair_sched_class;  	struct task_struct *p; @@ -3055,20 +3181,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev)  	 */  	if (likely(prev->sched_class == class &&  		   rq->nr_running == rq->cfs.h_nr_running)) { -		p = fair_sched_class.pick_next_task(rq, prev); +		p = fair_sched_class.pick_next_task(rq, prev, cookie);  		if (unlikely(p == RETRY_TASK))  			goto again;  		/* assumes fair_sched_class->next == idle_sched_class */  		if (unlikely(!p)) -			p = idle_sched_class.pick_next_task(rq, prev); +			p = idle_sched_class.pick_next_task(rq, prev, cookie);  		return p;  	}  again:  	for_each_class(class) { -		p = class->pick_next_task(rq, prev); +		p = class->pick_next_task(rq, prev, cookie);  		if (p) {  			if (unlikely(p == RETRY_TASK))  				goto again; @@ -3122,6 +3248,7 @@ static void __sched notrace __schedule(bool 
preempt)  {  	struct task_struct *prev, *next;  	unsigned long *switch_count; +	struct pin_cookie cookie;  	struct rq *rq;  	int cpu; @@ -3155,7 +3282,7 @@ static void __sched notrace __schedule(bool preempt)  	 */  	smp_mb__before_spinlock();  	raw_spin_lock(&rq->lock); -	lockdep_pin_lock(&rq->lock); +	cookie = lockdep_pin_lock(&rq->lock);  	rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3177,7 +3304,7 @@ static void __sched notrace __schedule(bool preempt)  				to_wakeup = wq_worker_sleeping(prev);  				if (to_wakeup) -					try_to_wake_up_local(to_wakeup); +					try_to_wake_up_local(to_wakeup, cookie);  			}  		}  		switch_count = &prev->nvcsw; @@ -3186,7 +3313,7 @@ static void __sched notrace __schedule(bool preempt)  	if (task_on_rq_queued(prev))  		update_rq_clock(rq); -	next = pick_next_task(rq, prev); +	next = pick_next_task(rq, prev, cookie);  	clear_tsk_need_resched(prev);  	clear_preempt_need_resched();  	rq->clock_skip_update = 0; @@ -3197,9 +3324,9 @@ static void __sched notrace __schedule(bool preempt)  		++*switch_count;  		trace_sched_switch(preempt, prev, next); -		rq = context_switch(rq, prev, next); /* unlocks the rq */ +		rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */  	} else { -		lockdep_unpin_lock(&rq->lock); +		lockdep_unpin_lock(&rq->lock, cookie);  		raw_spin_unlock_irq(&rq->lock);  	} @@ -3266,8 +3393,23 @@ void __sched schedule_preempt_disabled(void)  static void __sched notrace preempt_schedule_common(void)  {  	do { +		/* +		 * Because the function tracer can trace preempt_count_sub() +		 * and it also uses preempt_enable/disable_notrace(), if +		 * NEED_RESCHED is set, the preempt_enable_notrace() called +		 * by the function tracer will call this function again and +		 * cause infinite recursion. +		 * +		 * Preemption must be disabled here before the function +		 * tracer can trace. Break up preempt_disable() into two +		 * calls. One to disable preemption without fear of being +		 * traced. The other to still record the preemption latency, +		 * which can also be traced by the function tracer. +		 */  		preempt_disable_notrace(); +		preempt_latency_start(1);  		__schedule(true); +		preempt_latency_stop(1);  		preempt_enable_no_resched_notrace();  		/* @@ -3319,7 +3461,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)  		return;  	do { +		/* +		 * Because the function tracer can trace preempt_count_sub() +		 * and it also uses preempt_enable/disable_notrace(), if +		 * NEED_RESCHED is set, the preempt_enable_notrace() called +		 * by the function tracer will call this function again and +		 * cause infinite recursion. +		 * +		 * Preemption must be disabled here before the function +		 * tracer can trace. Break up preempt_disable() into two +		 * calls. One to disable preemption without fear of being +		 * traced. The other to still record the preemption latency, +		 * which can also be traced by the function tracer. 
+		 */  		preempt_disable_notrace(); +		preempt_latency_start(1);  		/*  		 * Needs preempt disabled in case user_exit() is traced  		 * and the tracer calls preempt_enable_notrace() causing @@ -3329,6 +3485,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)  		__schedule(true);  		exception_exit(prev_ctx); +		preempt_latency_stop(1);  		preempt_enable_no_resched_notrace();  	} while (need_resched());  } @@ -3385,12 +3542,13 @@ EXPORT_SYMBOL(default_wake_function);  void rt_mutex_setprio(struct task_struct *p, int prio)  {  	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; -	struct rq *rq;  	const struct sched_class *prev_class; +	struct rq_flags rf; +	struct rq *rq;  	BUG_ON(prio > MAX_PRIO); -	rq = __task_rq_lock(p); +	rq = __task_rq_lock(p, &rf);  	/*  	 * Idle task boosting is a nono in general. There is one @@ -3466,7 +3624,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	check_class_changed(rq, p, prev_class, oldprio);  out_unlock:  	preempt_disable(); /* avoid rq from going away on us */ -	__task_rq_unlock(rq); +	__task_rq_unlock(rq, &rf);  	balance_callback(rq);  	preempt_enable(); @@ -3476,7 +3634,7 @@ out_unlock:  void set_user_nice(struct task_struct *p, long nice)  {  	int old_prio, delta, queued; -	unsigned long flags; +	struct rq_flags rf;  	struct rq *rq;  	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -3485,7 +3643,7 @@ void set_user_nice(struct task_struct *p, long nice)  	 * We have to be careful, if called from sys_setpriority(),  	 * the task might be in the middle of scheduling on another CPU.  	 */ -	rq = task_rq_lock(p, &flags); +	rq = task_rq_lock(p, &rf);  	/*  	 * The RT priorities are set via sched_setscheduler(), but we still  	 * allow the 'normal' nice value to be set - but as expected @@ -3516,7 +3674,7 @@ void set_user_nice(struct task_struct *p, long nice)  			resched_curr(rq);  	}  out_unlock: -	task_rq_unlock(rq, p, &flags); +	task_rq_unlock(rq, p, &rf);  }  EXPORT_SYMBOL(set_user_nice); @@ -3813,11 +3971,11 @@ static int __sched_setscheduler(struct task_struct *p,  		      MAX_RT_PRIO - 1 - attr->sched_priority;  	int retval, oldprio, oldpolicy = -1, queued, running;  	int new_effective_prio, policy = attr->sched_policy; -	unsigned long flags;  	const struct sched_class *prev_class; -	struct rq *rq; +	struct rq_flags rf;  	int reset_on_fork;  	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; +	struct rq *rq;  	/* may grab non-irq protected spin_locks */  	BUG_ON(in_interrupt()); @@ -3912,13 +4070,13 @@ recheck:  	 * To be able to change p->policy safely, the appropriate  	 * runqueue lock must be held.  	 
*/ -	rq = task_rq_lock(p, &flags); +	rq = task_rq_lock(p, &rf);  	/*  	 * Changing the policy of the stop threads its a very bad idea  	 */  	if (p == rq->stop) { -		task_rq_unlock(rq, p, &flags); +		task_rq_unlock(rq, p, &rf);  		return -EINVAL;  	} @@ -3935,7 +4093,7 @@ recheck:  			goto change;  		p->sched_reset_on_fork = reset_on_fork; -		task_rq_unlock(rq, p, &flags); +		task_rq_unlock(rq, p, &rf);  		return 0;  	}  change: @@ -3949,7 +4107,7 @@ change:  		if (rt_bandwidth_enabled() && rt_policy(policy) &&  				task_group(p)->rt_bandwidth.rt_runtime == 0 &&  				!task_group_is_autogroup(task_group(p))) { -			task_rq_unlock(rq, p, &flags); +			task_rq_unlock(rq, p, &rf);  			return -EPERM;  		}  #endif @@ -3964,7 +4122,7 @@ change:  			 */  			if (!cpumask_subset(span, &p->cpus_allowed) ||  			    rq->rd->dl_bw.bw == 0) { -				task_rq_unlock(rq, p, &flags); +				task_rq_unlock(rq, p, &rf);  				return -EPERM;  			}  		} @@ -3974,7 +4132,7 @@ change:  	/* recheck policy now with rq lock held */  	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {  		policy = oldpolicy = -1; -		task_rq_unlock(rq, p, &flags); +		task_rq_unlock(rq, p, &rf);  		goto recheck;  	} @@ -3984,7 +4142,7 @@ change:  	 * is available.  	 */  	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { -		task_rq_unlock(rq, p, &flags); +		task_rq_unlock(rq, p, &rf);  		return -EBUSY;  	} @@ -4029,7 +4187,7 @@ change:  	check_class_changed(rq, p, prev_class, oldprio);  	preempt_disable(); /* avoid rq from going away on us */ -	task_rq_unlock(rq, p, &flags); +	task_rq_unlock(rq, p, &rf);  	if (pi)  		rt_mutex_adjust_pi(p); @@ -4882,10 +5040,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  {  	struct task_struct *p;  	unsigned int time_slice; -	unsigned long flags; +	struct rq_flags rf; +	struct timespec t;  	struct rq *rq;  	int retval; -	struct timespec t;  	if (pid < 0)  		return -EINVAL; @@ -4900,11 +5058,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  	if (retval)  		goto out_unlock; -	rq = task_rq_lock(p, &flags); +	rq = task_rq_lock(p, &rf);  	time_slice = 0;  	if (p->sched_class->get_rr_interval)  		time_slice = p->sched_class->get_rr_interval(rq, p); -	task_rq_unlock(rq, p, &flags); +	task_rq_unlock(rq, p, &rf);  	rcu_read_unlock();  	jiffies_to_timespec(time_slice, &t); @@ -4980,7 +5138,8 @@ void show_state_filter(unsigned long state_filter)  	touch_all_softlockup_watchdogs();  #ifdef CONFIG_SCHED_DEBUG -	sysrq_sched_debug_show(); +	if (!state_filter) +		sysrq_sched_debug_show();  #endif  	rcu_read_unlock();  	/* @@ -5142,6 +5301,8 @@ out:  #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; +  #ifdef CONFIG_NUMA_BALANCING  /* Migrate current task p to target_cpu */  int migrate_task_to(struct task_struct *p, int target_cpu) @@ -5167,11 +5328,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu)   */  void sched_setnuma(struct task_struct *p, int nid)  { -	struct rq *rq; -	unsigned long flags;  	bool queued, running; +	struct rq_flags rf; +	struct rq *rq; -	rq = task_rq_lock(p, &flags); +	rq = task_rq_lock(p, &rf);  	queued = task_on_rq_queued(p);  	running = task_current(rq, p); @@ -5186,7 +5347,7 @@ void sched_setnuma(struct task_struct *p, int nid)  		p->sched_class->set_curr_task(rq);  	if (queued)  		enqueue_task(rq, p, ENQUEUE_RESTORE); -	task_rq_unlock(rq, p, &flags); +	task_rq_unlock(rq, p, &rf);  }  #endif /* CONFIG_NUMA_BALANCING */ @@ -5202,7 +5363,7 @@ void idle_task_exit(void)  	BUG_ON(cpu_online(smp_processor_id()));  	if (mm != 
&init_mm) { -		switch_mm(mm, &init_mm, current); +		switch_mm_irqs_off(mm, &init_mm, current);  		finish_arch_post_lock_switch();  	}  	mmdrop(mm); @@ -5250,6 +5411,7 @@ static void migrate_tasks(struct rq *dead_rq)  {  	struct rq *rq = dead_rq;  	struct task_struct *next, *stop = rq->stop; +	struct pin_cookie cookie;  	int dest_cpu;  	/* @@ -5281,8 +5443,8 @@ static void migrate_tasks(struct rq *dead_rq)  		/*  		 * pick_next_task assumes pinned rq->lock.  		 */ -		lockdep_pin_lock(&rq->lock); -		next = pick_next_task(rq, &fake_task); +		cookie = lockdep_pin_lock(&rq->lock); +		next = pick_next_task(rq, &fake_task, cookie);  		BUG_ON(!next);  		next->sched_class->put_prev_task(rq, next); @@ -5295,7 +5457,7 @@ static void migrate_tasks(struct rq *dead_rq)  		 * because !cpu_active at this point, which means load-balance  		 * will not interfere. Also, stop-machine.  		 */ -		lockdep_unpin_lock(&rq->lock); +		lockdep_unpin_lock(&rq->lock, cookie);  		raw_spin_unlock(&rq->lock);  		raw_spin_lock(&next->pi_lock);  		raw_spin_lock(&rq->lock); @@ -5356,127 +5518,13 @@ static void set_rq_offline(struct rq *rq)  	}  } -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. - */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +static void set_cpu_rq_start_time(unsigned int cpu)  { -	int cpu = (long)hcpu; -	unsigned long flags;  	struct rq *rq = cpu_rq(cpu); -	switch (action & ~CPU_TASKS_FROZEN) { - -	case CPU_UP_PREPARE: -		rq->calc_load_update = calc_load_update; -		account_reset_rq(rq); -		break; - -	case CPU_ONLINE: -		/* Update our root-domain */ -		raw_spin_lock_irqsave(&rq->lock, flags); -		if (rq->rd) { -			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - -			set_rq_online(rq); -		} -		raw_spin_unlock_irqrestore(&rq->lock, flags); -		break; - -#ifdef CONFIG_HOTPLUG_CPU -	case CPU_DYING: -		sched_ttwu_pending(); -		/* Update our root-domain */ -		raw_spin_lock_irqsave(&rq->lock, flags); -		if (rq->rd) { -			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -			set_rq_offline(rq); -		} -		migrate_tasks(rq); -		BUG_ON(rq->nr_running != 1); /* the migration thread */ -		raw_spin_unlock_irqrestore(&rq->lock, flags); -		break; - -	case CPU_DEAD: -		calc_load_migrate(rq); -		break; -#endif -	} - -	update_max_interval(); - -	return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else.  This has to be lower priority than - * the notifier in the perf_event subsystem, though. 
- */ -static struct notifier_block migration_notifier = { -	.notifier_call = migration_call, -	.priority = CPU_PRI_MIGRATION, -}; - -static void set_cpu_rq_start_time(void) -{ -	int cpu = smp_processor_id(); -	struct rq *rq = cpu_rq(cpu);  	rq->age_stamp = sched_clock_cpu(cpu);  } -static int sched_cpu_active(struct notifier_block *nfb, -				      unsigned long action, void *hcpu) -{ -	int cpu = (long)hcpu; - -	switch (action & ~CPU_TASKS_FROZEN) { -	case CPU_STARTING: -		set_cpu_rq_start_time(); -		return NOTIFY_OK; - -	case CPU_DOWN_FAILED: -		set_cpu_active(cpu, true); -		return NOTIFY_OK; - -	default: -		return NOTIFY_DONE; -	} -} - -static int sched_cpu_inactive(struct notifier_block *nfb, -					unsigned long action, void *hcpu) -{ -	switch (action & ~CPU_TASKS_FROZEN) { -	case CPU_DOWN_PREPARE: -		set_cpu_active((long)hcpu, false); -		return NOTIFY_OK; -	default: -		return NOTIFY_DONE; -	} -} - -static int __init migration_init(void) -{ -	void *cpu = (void *)(long)smp_processor_id(); -	int err; - -	/* Initialize migration for the boot CPU */ -	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); -	BUG_ON(err == NOTIFY_BAD); -	migration_call(&migration_notifier, CPU_ONLINE, cpu); -	register_cpu_notifier(&migration_notifier); - -	/* Register cpu active notifiers */ -	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); -	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - -	return 0; -} -early_initcall(migration_init); -  static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */  #ifdef CONFIG_SCHED_DEBUG @@ -6624,10 +6672,10 @@ static void sched_init_numa(void)  	init_numa_topology_type();  } -static void sched_domains_numa_masks_set(int cpu) +static void sched_domains_numa_masks_set(unsigned int cpu)  { -	int i, j;  	int node = cpu_to_node(cpu); +	int i, j;  	for (i = 0; i < sched_domains_numa_levels; i++) {  		for (j = 0; j < nr_node_ids; j++) { @@ -6637,51 +6685,20 @@ static void sched_domains_numa_masks_set(int cpu)  	}  } -static void sched_domains_numa_masks_clear(int cpu) +static void sched_domains_numa_masks_clear(unsigned int cpu)  {  	int i, j; +  	for (i = 0; i < sched_domains_numa_levels; i++) {  		for (j = 0; j < nr_node_ids; j++)  			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);  	}  } -/* - * Update sched_domains_numa_masks[level][node] array when new cpus - * are onlined. - */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, -					   unsigned long action, -					   void *hcpu) -{ -	int cpu = (long)hcpu; - -	switch (action & ~CPU_TASKS_FROZEN) { -	case CPU_ONLINE: -		sched_domains_numa_masks_set(cpu); -		break; - -	case CPU_DEAD: -		sched_domains_numa_masks_clear(cpu); -		break; - -	default: -		return NOTIFY_DONE; -	} - -	return NOTIFY_OK; -}  #else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, -					   unsigned long action, -					   void *hcpu) -{ -	return 0; -} +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { }  #endif /* CONFIG_NUMA */  static int __sdt_alloc(const struct cpumask *cpu_map) @@ -7071,13 +7088,9 @@ static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */   * If we come here as part of a suspend/resume, don't touch cpusets because we   * want to restore it back to its original state upon resume anyway.   
*/ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, -			     void *hcpu) +static void cpuset_cpu_active(void)  { -	switch (action) { -	case CPU_ONLINE_FROZEN: -	case CPU_DOWN_FAILED_FROZEN: - +	if (cpuhp_tasks_frozen) {  		/*  		 * num_cpus_frozen tracks how many CPUs are involved in suspend  		 * resume sequence. As long as this is not the last online @@ -7087,35 +7100,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,  		num_cpus_frozen--;  		if (likely(num_cpus_frozen)) {  			partition_sched_domains(1, NULL, NULL); -			break; +			return;  		} -  		/*  		 * This is the last CPU online operation. So fall through and  		 * restore the original sched domains by considering the  		 * cpuset configurations.  		 */ - -	case CPU_ONLINE: -		cpuset_update_active_cpus(true); -		break; -	default: -		return NOTIFY_DONE;  	} -	return NOTIFY_OK; +	cpuset_update_active_cpus(true);  } -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, -			       void *hcpu) +static int cpuset_cpu_inactive(unsigned int cpu)  {  	unsigned long flags; -	long cpu = (long)hcpu;  	struct dl_bw *dl_b;  	bool overflow;  	int cpus; -	switch (action) { -	case CPU_DOWN_PREPARE: +	if (!cpuhp_tasks_frozen) {  		rcu_read_lock_sched();  		dl_b = dl_bw_of(cpu); @@ -7127,19 +7130,120 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,  		rcu_read_unlock_sched();  		if (overflow) -			return notifier_from_errno(-EBUSY); +			return -EBUSY;  		cpuset_update_active_cpus(false); -		break; -	case CPU_DOWN_PREPARE_FROZEN: +	} else {  		num_cpus_frozen++;  		partition_sched_domains(1, NULL, NULL); -		break; -	default: -		return NOTIFY_DONE;  	} -	return NOTIFY_OK; +	return 0;  } +int sched_cpu_activate(unsigned int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long flags; + +	set_cpu_active(cpu, true); + +	if (sched_smp_initialized) { +		sched_domains_numa_masks_set(cpu); +		cpuset_cpu_active(); +	} + +	/* +	 * Put the rq online, if not already. This happens: +	 * +	 * 1) In the early boot process, because we build the real domains +	 *    after all cpus have been brought up. +	 * +	 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the +	 *    domains. +	 */ +	raw_spin_lock_irqsave(&rq->lock, flags); +	if (rq->rd) { +		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); +		set_rq_online(rq); +	} +	raw_spin_unlock_irqrestore(&rq->lock, flags); + +	update_max_interval(); + +	return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ +	int ret; + +	set_cpu_active(cpu, false); +	/* +	 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU +	 * users of this state to go away such that all new such users will +	 * observe it. +	 * +	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might +	 * not imply sync_sched(), so wait for both. +	 * +	 * Do sync before park smpboot threads to take care the rcu boost case. 
+	 */ +	if (IS_ENABLED(CONFIG_PREEMPT)) +		synchronize_rcu_mult(call_rcu, call_rcu_sched); +	else +		synchronize_rcu(); + +	if (!sched_smp_initialized) +		return 0; + +	ret = cpuset_cpu_inactive(cpu); +	if (ret) { +		set_cpu_active(cpu, true); +		return ret; +	} +	sched_domains_numa_masks_clear(cpu); +	return 0; +} + +static void sched_rq_cpu_starting(unsigned int cpu) +{ +	struct rq *rq = cpu_rq(cpu); + +	rq->calc_load_update = calc_load_update; +	account_reset_rq(rq); +	update_max_interval(); +} + +int sched_cpu_starting(unsigned int cpu) +{ +	set_cpu_rq_start_time(cpu); +	sched_rq_cpu_starting(cpu); +	return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long flags; + +	/* Handle pending wakeups and then migrate everything off */ +	sched_ttwu_pending(); +	raw_spin_lock_irqsave(&rq->lock, flags); +	if (rq->rd) { +		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); +		set_rq_offline(rq); +	} +	migrate_tasks(rq); +	BUG_ON(rq->nr_running != 1); +	raw_spin_unlock_irqrestore(&rq->lock, flags); +	calc_load_migrate(rq); +	update_max_interval(); +	nohz_balance_exit_idle(cpu); +	hrtick_clear(rq); +	return 0; +} +#endif +  void __init sched_init_smp(void)  {  	cpumask_var_t non_isolated_cpus; @@ -7161,12 +7265,6 @@ void __init sched_init_smp(void)  		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);  	mutex_unlock(&sched_domains_mutex); -	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); -	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); -	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - -	init_hrtick(); -  	/* Move init over to a non-isolated CPU */  	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)  		BUG(); @@ -7175,7 +7273,16 @@ void __init sched_init_smp(void)  	init_sched_rt_class();  	init_sched_dl_class(); +	sched_smp_initialized = true; +} + +static int __init migration_init(void) +{ +	sched_rq_cpu_starting(smp_processor_id()); +	return 0;  } +early_initcall(migration_init); +  #else  void __init sched_init_smp(void)  { @@ -7310,8 +7417,6 @@ void __init sched_init(void)  		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)  			rq->cpu_load[j] = 0; -		rq->last_load_update_tick = jiffies; -  #ifdef CONFIG_SMP  		rq->sd = NULL;  		rq->rd = NULL; @@ -7330,12 +7435,13 @@ void __init sched_init(void)  		rq_attach_root(rq, &def_root_domain);  #ifdef CONFIG_NO_HZ_COMMON +		rq->last_load_update_tick = jiffies;  		rq->nohz_flags = 0;  #endif  #ifdef CONFIG_NO_HZ_FULL  		rq->last_sched_tick = 0;  #endif -#endif +#endif /* CONFIG_SMP */  		init_rq_hrtick(rq);  		atomic_set(&rq->nr_iowait, 0);  	} @@ -7373,7 +7479,7 @@ void __init sched_init(void)  	if (cpu_isolated_map == NULL)  		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);  	idle_thread_set_boot_cpu(); -	set_cpu_rq_start_time(); +	set_cpu_rq_start_time(smp_processor_id());  #endif  	init_sched_fair_class(); @@ -7618,10 +7724,10 @@ void sched_move_task(struct task_struct *tsk)  {  	struct task_group *tg;  	int queued, running; -	unsigned long flags; +	struct rq_flags rf;  	struct rq *rq; -	rq = task_rq_lock(tsk, &flags); +	rq = task_rq_lock(tsk, &rf);  	running = task_current(rq, tsk);  	queued = task_on_rq_queued(tsk); @@ -7653,7 +7759,7 @@ void sched_move_task(struct task_struct *tsk)  	if (queued)  		enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); -	task_rq_unlock(rq, tsk, &flags); +	task_rq_unlock(rq, tsk, &rf);  }  #endif /* CONFIG_CGROUP_SCHED */ @@ -7873,7 +7979,7 @@ static int sched_rt_can_attach(struct 
task_group *tg, struct task_struct *tsk)  static int sched_rt_global_constraints(void)  {  	unsigned long flags; -	int i, ret = 0; +	int i;  	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);  	for_each_possible_cpu(i) { @@ -7885,7 +7991,7 @@ static int sched_rt_global_constraints(void)  	}  	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); -	return ret; +	return 0;  }  #endif /* CONFIG_RT_GROUP_SCHED */
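
The patch adds __task_rq_lock()/task_rq_lock() to core.c and has them hand back a lockdep pin cookie through struct rq_flags. The heart of both helpers is a lock-and-revalidate loop: read which runqueue the task claims to be on, take that runqueue's lock, then re-check the claim (and that no migration is in flight) before trusting it, otherwise drop the lock and retry. Below is a minimal user-space sketch of that retry pattern built on pthreads and C11 atomics; struct demo_rq, struct demo_task and demo_task_rq_lock() are invented names for illustration only, and the sketch omits the pi_lock, IRQ and lockdep-cookie handling of the real helpers.

/*
 * User-space sketch of the lock-and-revalidate loop in the helpers above.
 * Names are invented for the demo; nothing is modelled beyond "an object
 * records which queue it is on, and that can change concurrently".
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_rq {
	pthread_mutex_t lock;
};

struct demo_task {
	_Atomic(struct demo_rq *) rq;	/* queue the task currently claims */
	atomic_bool migrating;		/* set while a move is in flight */
};

/* Return the task's queue with its lock held; caller unlocks it later. */
static struct demo_rq *demo_task_rq_lock(struct demo_task *t)
{
	for (;;) {
		struct demo_rq *rq = atomic_load(&t->rq);

		pthread_mutex_lock(&rq->lock);
		/* Re-check under the lock: did the task move meanwhile? */
		if (rq == atomic_load(&t->rq) && !atomic_load(&t->migrating))
			return rq;
		pthread_mutex_unlock(&rq->lock);

		/* Wait out an in-flight move, like the cpu_relax() spin. */
		while (atomic_load(&t->migrating))
			sched_yield();
	}
}

int main(void)
{
	struct demo_rq rq0 = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct demo_task t = { .rq = &rq0, .migrating = false };
	struct demo_rq *locked = demo_task_rq_lock(&t);

	printf("holding the lock of t's queue (%p)\n", (void *)locked);
	pthread_mutex_unlock(&locked->lock);
	return 0;
}

In the kernel the revalidation is sound because a migration must itself hold the old runqueue's lock while moving the task, so once task_rq(p) is re-confirmed under that lock the task is pinned; the task_on_rq_migrating() check plus the cpu_relax() spin covers the window in which ->cpu has already been updated but the move has not yet finished, exactly as the memory-ordering comment in the task_rq_lock() hunk describes.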
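The new fetch_or() is deliberately a macro over cmpxchg() so that, as its comment says, it works for different integer types; it sits just ahead of the TIF_POLLING_NRFLAG block that atomically sets TIF_NEED_RESCHED. The idea is a classic compare-and-swap loop: keep retrying old -> old | mask until no other CPU raced with the update, and return the value observed before the OR took effect. A rough user-space equivalent using C11 atomics, with an invented function name rather than the kernel macro:

/*
 * User-space analogue of the fetch_or() pattern above: atomically OR a
 * mask into a word and return the value that was there beforehand, by
 * retrying a compare-and-swap until it succeeds. Illustration only.
 */
#include <stdatomic.h>
#include <stdio.h>

static unsigned int demo_fetch_or(atomic_uint *ptr, unsigned int mask)
{
	unsigned int old = atomic_load(ptr);

	/*
	 * On failure, atomic_compare_exchange_weak() refreshes 'old' with
	 * the value another thread just wrote, so the next attempt ORs the
	 * mask into the current contents.
	 */
	while (!atomic_compare_exchange_weak(ptr, &old, old | mask))
		;

	return old;	/* value seen just before our OR was applied */
}

int main(void)
{
	atomic_uint flags = 0x1;
	unsigned int prev = demo_fetch_or(&flags, 0x4);

	printf("previous=0x%x now=0x%x\n", prev, atomic_load(&flags));
	return 0;
}

C11 also offers atomic_fetch_or() directly; the loop above just rebuilds that primitive by hand, which is what the kernel macro does on top of cmpxchg() so the same code can be applied to any integer lvalue.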