Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 178
1 file changed, 137 insertions(+), 41 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..2f9c92884817 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 
 static void sched_feat_disable(int i)
 {
-	if (static_key_enabled(&sched_feat_keys[i]))
-		static_key_slow_dec(&sched_feat_keys[i]);
+	static_key_disable(&sched_feat_keys[i]);
 }
 
 static void sched_feat_enable(int i)
 {
-	if (!static_key_enabled(&sched_feat_keys[i]))
-		static_key_slow_inc(&sched_feat_keys[i]);
+	static_key_enable(&sched_feat_keys[i]);
 }
 #else
 static void sched_feat_disable(int i) { };
@@ -623,18 +621,21 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (!idle_cpu(cpu))
+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
-			if (!idle_cpu(i)) {
+			if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
+
+	if (!is_housekeeping_cpu(cpu))
+		cpu = housekeeping_any_cpu();
 unlock:
 	rcu_read_unlock();
 	return cpu;
@@ -1151,15 +1152,45 @@ static int migration_cpu_stop(void *data)
 	return 0;
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
 	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	struct rq *rq = task_rq(p);
+	bool queued, running;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+
+	if (queued) {
+		/*
+		 * Because __kthread_bind() calls this on blocked tasks without
+		 * holding rq->lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+		dequeue_task(rq, p, 0);
+	}
+	if (running)
+		put_prev_task(rq, p);
+
+	p->sched_class->set_cpus_allowed(p, new_mask);
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (queued)
+		enqueue_task(rq, p, 0);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -1169,7 +1200,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
 {
 	unsigned long flags;
 	struct rq *rq;
@@ -1178,6 +1210,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
 	rq = task_rq_lock(p, &flags);
 
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
@@ -1214,6 +1255,11 @@ out:
 
 	return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1595,6 +1641,15 @@ static void update_avg(u64 *avg, u64 sample)
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+					 const struct cpumask *new_mask, bool check)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+
 #endif /* CONFIG_SMP */
 
 static void
@@ -1654,9 +1709,9 @@ static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	check_preempt_curr(rq, p, wake_flags);
-	trace_sched_wakeup(p, true);
-
 	p->state = TASK_RUNNING;
+	trace_sched_wakeup(p);
+
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
@@ -1874,6 +1929,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	if (!(p->state & state))
 		goto out;
 
+	trace_sched_waking(p);
+
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
@@ -1949,6 +2006,8 @@ static void try_to_wake_up_local(struct task_struct *p)
 	if (!(p->state & TASK_NORMAL))
 		goto out;
 
+	trace_sched_waking(p);
+
 	if (!task_on_rq_queued(p))
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
@@ -2016,9 +2075,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
-#ifdef CONFIG_SMP
-	p->se.avg.decay_count		= 0;
-#endif
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
@@ -2200,8 +2256,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_SMP
 inline struct dl_bw *dl_bw_of(int i)
 {
-	rcu_lockdep_assert(rcu_read_lock_sched_held(),
-			   "sched RCU must be held");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
 	return &cpu_rq(i)->rd->dl_bw;
 }
 
@@ -2210,8 +2266,8 @@ static inline int dl_bw_cpus(int i)
 	struct root_domain *rd = cpu_rq(i)->rd;
 	int cpus = 0;
 
-	rcu_lockdep_assert(rcu_read_lock_sched_held(),
-			   "sched RCU must be held");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
 	for_each_cpu_and(i, rd->span, cpu_active_mask)
 		cpus++;
 
@@ -2303,11 +2359,11 @@ void wake_up_new_task(struct task_struct *p)
 #endif
 
 	/* Initialize new task's runnable average */
-	init_task_runnable_average(p);
+	init_entity_runnable_average(&p->se);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
-	trace_sched_wakeup_new(p, true);
+	trace_sched_wakeup_new(p);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
@@ -2469,7 +2525,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
-	finish_arch_switch(prev);
 	perf_event_task_sched_in(prev, current);
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
@@ -2489,7 +2544,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		put_task_struct(prev);
 	}
 
-	tick_nohz_task_switch(current);
+	tick_nohz_task_switch();
 	return rq;
 }
 
@@ -2614,13 +2669,20 @@ unsigned long nr_running(void)
 
 /*
  * Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race.  The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
  */
 bool single_task_running(void)
 {
-	if (cpu_rq(smp_processor_id())->nr_running == 1)
-		return true;
-	else
-		return false;
+	return raw_rq()->nr_running == 1;
 }
 EXPORT_SYMBOL(single_task_running);
 
@@ -4340,7 +4402,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	}
 #endif
 again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -4492,7 +4554,7 @@ SYSCALL_DEFINE0(sched_yield)
 
 int __sched _cond_resched(void)
 {
-	if (should_resched()) {
+	if (should_resched(0)) {
 		preempt_schedule_common();
 		return 1;
 	}
@@ -4510,7 +4572,7 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
-	int resched = should_resched();
+	int resched = should_resched(PREEMPT_LOCK_OFFSET);
 	int ret = 0;
 
 	lockdep_assert_held(lock);
@@ -4532,7 +4594,7 @@ int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
-	if (should_resched()) {
+	if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
 		local_bh_enable();
 		preempt_schedule_common();
 		local_bh_disable();
@@ -4865,7 +4927,8 @@ void init_idle(struct task_struct *idle, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	raw_spin_lock(&rq->lock);
 
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
@@ -4891,7 +4954,8 @@ void init_idle(struct task_struct *idle, int cpu)
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
@@ -5124,24 +5188,47 @@ static void migrate_tasks(struct rq *dead_rq)
 			break;
 
 		/*
-		 * Ensure rq->lock covers the entire task selection
-		 * until the migration.
+		 * pick_next_task assumes pinned rq->lock.
 		 */
 		lockdep_pin_lock(&rq->lock);
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
+		/*
+		 * Rules for changing task_struct::cpus_allowed are holding
+		 * both pi_lock and rq->lock, such that holding either
+		 * stabilizes the mask.
+		 *
+		 * Drop rq->lock is not quite as disastrous as it usually is
+		 * because !cpu_active at this point, which means load-balance
+		 * will not interfere. Also, stop-machine.
+		 */
+		lockdep_unpin_lock(&rq->lock);
+		raw_spin_unlock(&rq->lock);
+		raw_spin_lock(&next->pi_lock);
+		raw_spin_lock(&rq->lock);
+
+		/*
+		 * Since we're inside stop-machine, _nothing_ should have
+		 * changed the task, WARN if weird stuff happened, because in
+		 * that case the above rq->lock drop is a fail too.
+		 */
+		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+			raw_spin_unlock(&next->pi_lock);
+			continue;
+		}
+
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-		lockdep_unpin_lock(&rq->lock);
 
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
 			raw_spin_unlock(&rq->lock);
 			rq = dead_rq;
 			raw_spin_lock(&rq->lock);
 		}
+		raw_spin_unlock(&next->pi_lock);
 	}
 
 	rq->stop = stop;
@@ -5311,8 +5398,7 @@ static void register_sched_domain_sysctl(void)
 /* may be called multiple times per register */
 static void unregister_sched_domain_sysctl(void)
 {
-	if (sd_sysctl_header)
-		unregister_sysctl_table(sd_sysctl_header);
+	unregister_sysctl_table(sd_sysctl_header);
 	sd_sysctl_header = NULL;
 	if (sd_ctl_dir[0].child)
 		sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@ -5433,6 +5519,14 @@ static int sched_cpu_active(struct notifier_block *nfb,
 	case CPU_STARTING:
 		set_cpu_rq_start_time();
 		return NOTIFY_OK;
+	case CPU_ONLINE:
+		/*
+		 * At this point a starting CPU has marked itself as online via
+		 * set_cpu_online(). But it might not yet have marked itself
+		 * as active, which is essential from here on.
+		 *
+		 * Thus, fall-through and help the starting CPU along.
+		 */
	case CPU_DOWN_FAILED:
 		set_cpu_active((long)hcpu, true);
 		return NOTIFY_OK;
@@ -6445,8 +6539,10 @@ static void init_numa_topology_type(void)
 
 	n = sched_max_numa_distance;
 
-	if (n <= 1)
+	if (sched_domains_numa_levels <= 1) {
 		sched_numa_topology_type = NUMA_DIRECT;
+		return;
+	}
 
 	for_each_online_node(a) {
 		for_each_online_node(b) {
@@ -8068,7 +8164,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 	sched_offline_group(tg);
 }
 
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
 {
 	sched_move_task(task);
 }
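A note on the new affinity helpers above: do_set_cpus_allowed() now calls sched_class::set_cpus_allowed() unconditionally, so every scheduling class must provide the hook, and set_cpus_allowed_common() is the mask/weight update each hook has to perform, either by calling it or by open-coding the same thing. The snippet below is only an illustrative sketch, not code from this commit; example_rq_of() and nr_migratory are made-up names standing in for whatever per-class bookkeeping a real class keeps around the common update.

/*
 * Hypothetical sched_class::set_cpus_allowed hook: adjust per-class
 * accounting around the mandatory common mask update.
 */
static void set_cpus_allowed_example(struct task_struct *p,
				     const struct cpumask *new_mask)
{
	struct example_rq *erq = example_rq_of(task_rq(p));	/* made-up per-rq state */

	/* retract accounting done against the old mask */
	if (p->nr_cpus_allowed > 1)
		erq->nr_migratory--;

	/* the part every class must do: update ->cpus_allowed and ->nr_cpus_allowed */
	set_cpus_allowed_common(p, new_mask);

	/* redo accounting against the new mask */
	if (p->nr_cpus_allowed > 1)
		erq->nr_migratory++;
}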
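sched_setaffinity() now funnels into __set_cpus_allowed_ptr(p, new_mask, true), so PF_NO_SETAFFINITY is re-checked under the task's runqueue lock and a task that __kthread_bind() has made unbindable gets -EINVAL even if it raced with the earlier flag check. Nothing changes for ordinary user-space callers of the syscall; a minimal illustration of that entry point:

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);	/* ask to run only on CPU 0 */

	/* pid 0 == the calling task; in the kernel this reaches __set_cpus_allowed_ptr(p, mask, true) */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		fprintf(stderr, "sched_setaffinity: %s\n", strerror(errno));
		return 1;
	}

	printf("now restricted to CPU %d\n", sched_getcpu());
	return 0;
}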
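single_task_running() now does a plain read of raw_rq()->nr_running, and the comment added above spells out when acting on such an unsynchronized read is acceptable. A sketch of the short-iteration polling case the comment mentions (kernel context assumed; example_done() is a placeholder, not a real kernel function):

/*
 * Poll until the (made-up) condition fires, but give the CPU up as soon
 * as another task becomes runnable here.  Iterations are short, so a
 * stale answer from single_task_running() costs at most one extra pass.
 */
while (!example_done()) {
	if (!single_task_running() || need_resched())
		break;
	cpu_relax();
}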