Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/completion.c |  36
-rw-r--r--  kernel/sched/core.c       |  55
-rw-r--r--  kernel/sched/cpupri.c     | 158
-rw-r--r--  kernel/sched/cpupri.h     |   6
-rw-r--r--  kernel/sched/cputime.c    |  45
-rw-r--r--  kernel/sched/deadline.c   |   6
-rw-r--r--  kernel/sched/debug.c      |  61
-rw-r--r--  kernel/sched/fair.c       | 839
-rw-r--r--  kernel/sched/isolation.c  |  21
-rw-r--r--  kernel/sched/pelt.c       |  90
-rw-r--r--  kernel/sched/pelt.h       |  31
-rw-r--r--  kernel/sched/psi.c        | 111
-rw-r--r--  kernel/sched/rt.c         |  66
-rw-r--r--  kernel/sched/sched.h      |  80
-rw-r--r--  kernel/sched/stats.h      |  31
-rw-r--r--  kernel/sched/swait.c      |  15
-rw-r--r--  kernel/sched/topology.c   |  27
17 files changed, 1120 insertions, 558 deletions
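
Illustrative aside (not part of the patch below): the first hunk, in completion.c, moves struct completion's internals from a wait_queue_head_t guarded by spinlock_t to a simple waitqueue (swait) guarded by raw_spinlock_t, while the caller-visible API is unchanged. A minimal usage sketch of that unchanged API follows; queue_example_work() is a purely hypothetical helper used only for illustration.

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int example_wait_for_worker(void)
{
	DECLARE_COMPLETION_ONSTACK(done);	/* backed by a swait queue after this change */
	unsigned long left;

	/* hand &done to some worker that calls complete(&done) when it finishes */
	queue_example_work(&done);		/* hypothetical helper, not a kernel API */

	/* sleeps on x->wait via __prepare_to_swait()/__finish_swait() */
	left = wait_for_completion_timeout(&done, msecs_to_jiffies(100));

	return left ? 0 : -ETIMEDOUT;
}
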
| diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a1ad5b7d5521..a778554f9dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -29,12 +29,12 @@ void complete(struct completion *x)  {  	unsigned long flags; -	spin_lock_irqsave(&x->wait.lock, flags); +	raw_spin_lock_irqsave(&x->wait.lock, flags);  	if (x->done != UINT_MAX)  		x->done++; -	__wake_up_locked(&x->wait, TASK_NORMAL, 1); -	spin_unlock_irqrestore(&x->wait.lock, flags); +	swake_up_locked(&x->wait); +	raw_spin_unlock_irqrestore(&x->wait.lock, flags);  }  EXPORT_SYMBOL(complete); @@ -58,10 +58,12 @@ void complete_all(struct completion *x)  {  	unsigned long flags; -	spin_lock_irqsave(&x->wait.lock, flags); +	lockdep_assert_RT_in_threaded_ctx(); + +	raw_spin_lock_irqsave(&x->wait.lock, flags);  	x->done = UINT_MAX; -	__wake_up_locked(&x->wait, TASK_NORMAL, 0); -	spin_unlock_irqrestore(&x->wait.lock, flags); +	swake_up_all_locked(&x->wait); +	raw_spin_unlock_irqrestore(&x->wait.lock, flags);  }  EXPORT_SYMBOL(complete_all); @@ -70,20 +72,20 @@ do_wait_for_common(struct completion *x,  		   long (*action)(long), long timeout, int state)  {  	if (!x->done) { -		DECLARE_WAITQUEUE(wait, current); +		DECLARE_SWAITQUEUE(wait); -		__add_wait_queue_entry_tail_exclusive(&x->wait, &wait);  		do {  			if (signal_pending_state(state, current)) {  				timeout = -ERESTARTSYS;  				break;  			} +			__prepare_to_swait(&x->wait, &wait);  			__set_current_state(state); -			spin_unlock_irq(&x->wait.lock); +			raw_spin_unlock_irq(&x->wait.lock);  			timeout = action(timeout); -			spin_lock_irq(&x->wait.lock); +			raw_spin_lock_irq(&x->wait.lock);  		} while (!x->done && timeout); -		__remove_wait_queue(&x->wait, &wait); +		__finish_swait(&x->wait, &wait);  		if (!x->done)  			return timeout;  	} @@ -100,9 +102,9 @@ __wait_for_common(struct completion *x,  	complete_acquire(x); -	spin_lock_irq(&x->wait.lock); +	raw_spin_lock_irq(&x->wait.lock);  	timeout = do_wait_for_common(x, action, timeout, state); -	spin_unlock_irq(&x->wait.lock); +	raw_spin_unlock_irq(&x->wait.lock);  	complete_release(x); @@ -291,12 +293,12 @@ bool try_wait_for_completion(struct completion *x)  	if (!READ_ONCE(x->done))  		return false; -	spin_lock_irqsave(&x->wait.lock, flags); +	raw_spin_lock_irqsave(&x->wait.lock, flags);  	if (!x->done)  		ret = false;  	else if (x->done != UINT_MAX)  		x->done--; -	spin_unlock_irqrestore(&x->wait.lock, flags); +	raw_spin_unlock_irqrestore(&x->wait.lock, flags);  	return ret;  }  EXPORT_SYMBOL(try_wait_for_completion); @@ -322,8 +324,8 @@ bool completion_done(struct completion *x)  	 * otherwise we can end up freeing the completion before complete()  	 * is done referencing it.  	 
*/ -	spin_lock_irqsave(&x->wait.lock, flags); -	spin_unlock_irqrestore(&x->wait.lock, flags); +	raw_spin_lock_irqsave(&x->wait.lock, flags); +	raw_spin_unlock_irqrestore(&x->wait.lock, flags);  	return true;  }  EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a9983da4408..9a2fbf98fd6f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -269,7 +269,6 @@ static void __hrtick_start(void *arg)  	rq_lock(rq, &rf);  	__hrtick_restart(rq); -	rq->hrtick_csd_pending = 0;  	rq_unlock(rq, &rf);  } @@ -293,12 +292,10 @@ void hrtick_start(struct rq *rq, u64 delay)  	hrtimer_set_expires(timer, time); -	if (rq == this_rq()) { +	if (rq == this_rq())  		__hrtick_restart(rq); -	} else if (!rq->hrtick_csd_pending) { +	else  		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -		rq->hrtick_csd_pending = 1; -	}  }  #else @@ -322,8 +319,6 @@ void hrtick_start(struct rq *rq, u64 delay)  static void hrtick_rq_init(struct rq *rq)  {  #ifdef CONFIG_SMP -	rq->hrtick_csd_pending = 0; -  	rq->hrtick_csd.flags = 0;  	rq->hrtick_csd.func = __hrtick_start;  	rq->hrtick_csd.info = rq; @@ -761,7 +756,6 @@ static void set_load_weight(struct task_struct *p, bool update_load)  	if (task_has_idle_policy(p)) {  		load->weight = scale_load(WEIGHT_IDLEPRIO);  		load->inv_weight = WMULT_IDLEPRIO; -		p->se.runnable_weight = load->weight;  		return;  	} @@ -774,7 +768,6 @@ static void set_load_weight(struct task_struct *p, bool update_load)  	} else {  		load->weight = scale_load(sched_prio_to_weight[prio]);  		load->inv_weight = sched_prio_to_wmult[prio]; -		p->se.runnable_weight = load->weight;  	}  } @@ -1239,13 +1232,8 @@ static void uclamp_fork(struct task_struct *p)  		return;  	for_each_clamp_id(clamp_id) { -		unsigned int clamp_value = uclamp_none(clamp_id); - -		/* By default, RT tasks always get 100% boost */ -		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) -			clamp_value = uclamp_none(UCLAMP_MAX); - -		uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false); +		uclamp_se_set(&p->uclamp_req[clamp_id], +			      uclamp_none(clamp_id), false);  	}  } @@ -1652,7 +1640,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  	if (cpumask_equal(p->cpus_ptr, new_mask))  		goto out; -	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); +	/* +	 * Picking a ~random cpu helps in cases where we are changing affinity +	 * for groups of tasks (ie. cpuset), so that load balancing is not +	 * immediately required to distribute the tasks within their new mask. +	 */ +	dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);  	if (dest_cpu >= nr_cpu_ids) {  		ret = -EINVAL;  		goto out; @@ -2121,12 +2114,6 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)  	return cpu;  } -static void update_avg(u64 *avg, u64 sample) -{ -	s64 diff = sample - *avg; -	*avg += diff >> 3; -} -  void sched_set_stop_task(int cpu, struct task_struct *stop)  {  	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -3578,6 +3565,17 @@ unsigned long long task_sched_runtime(struct task_struct *p)  	return ns;  } +DEFINE_PER_CPU(unsigned long, thermal_pressure); + +void arch_set_thermal_pressure(struct cpumask *cpus, +			       unsigned long th_pressure) +{ +	int cpu; + +	for_each_cpu(cpu, cpus) +		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); +} +  /*   * This function gets called by the timer code, with HZ frequency.   * We call it with interrupts disabled. 
@@ -3588,12 +3586,16 @@ void scheduler_tick(void)  	struct rq *rq = cpu_rq(cpu);  	struct task_struct *curr = rq->curr;  	struct rq_flags rf; +	unsigned long thermal_pressure; +	arch_scale_freq_tick();  	sched_clock_tick();  	rq_lock(rq, &rf);  	update_rq_clock(rq); +	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); +	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);  	curr->sched_class->task_tick(rq, curr, 0);  	calc_global_load_tick(rq);  	psi_task_tick(rq); @@ -3671,7 +3673,6 @@ static void sched_tick_remote(struct work_struct *work)  	if (cpu_is_offline(cpu))  		goto out_unlock; -	curr = rq->curr;  	update_rq_clock(rq);  	if (!is_idle_task(curr)) { @@ -4074,6 +4075,8 @@ static void __sched notrace __schedule(bool preempt)  		 */  		++*switch_count; +		psi_sched_switch(prev, next, !task_on_rq_queued(prev)); +  		trace_sched_switch(preempt, prev, next);  		/* Also unlocks the rq: */ @@ -4112,7 +4115,8 @@ static inline void sched_submit_work(struct task_struct *tsk)  	 * it wants to wake up a task to maintain concurrency.  	 * As this function is called inside the schedule() context,  	 * we disable preemption to avoid it calling schedule() again -	 * in the possible wakeup of a kworker. +	 * in the possible wakeup of a kworker and because wq_worker_sleeping() +	 * requires it.  	 */  	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {  		preempt_disable(); @@ -6685,7 +6689,6 @@ void __init sched_init(void)  		rq_attach_root(rq, &def_root_domain);  #ifdef CONFIG_NO_HZ_COMMON -		rq->last_load_update_tick = jiffies;  		rq->last_blocked_load_update_tick = jiffies;  		atomic_set(&rq->nohz_flags, 0);  #endif diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1a2719e1350a..0033731a0797 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -41,8 +41,67 @@ static int convert_prio(int prio)  	return cpupri;  } +static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, +				struct cpumask *lowest_mask, int idx) +{ +	struct cpupri_vec *vec  = &cp->pri_to_cpu[idx]; +	int skip = 0; + +	if (!atomic_read(&(vec)->count)) +		skip = 1; +	/* +	 * When looking at the vector, we need to read the counter, +	 * do a memory barrier, then read the mask. +	 * +	 * Note: This is still all racey, but we can deal with it. +	 *  Ideally, we only want to look at masks that are set. +	 * +	 *  If a mask is not set, then the only thing wrong is that we +	 *  did a little more work than necessary. +	 * +	 *  If we read a zero count but the mask is set, because of the +	 *  memory barriers, that can only happen when the highest prio +	 *  task for a run queue has left the run queue, in which case, +	 *  it will be followed by a pull. If the task we are processing +	 *  fails to find a proper place to go, that pull request will +	 *  pull this task if the run queue is running at a lower +	 *  priority. +	 */ +	smp_rmb(); + +	/* Need to do the rmb for every iteration */ +	if (skip) +		return 0; + +	if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) +		return 0; + +	if (lowest_mask) { +		cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); + +		/* +		 * We have to ensure that we have at least one bit +		 * still set in the array, since the map could have +		 * been concurrently emptied between the first and +		 * second reads of vec->mask.  If we hit this +		 * condition, simply act as though we never hit this +		 * priority level and continue on. 
+		 */ +		if (cpumask_empty(lowest_mask)) +			return 0; +	} + +	return 1; +} + +int cpupri_find(struct cpupri *cp, struct task_struct *p, +		struct cpumask *lowest_mask) +{ +	return cpupri_find_fitness(cp, p, lowest_mask, NULL); +} +  /** - * cpupri_find - find the best (lowest-pri) CPU in the system + * cpupri_find_fitness - find the best (lowest-pri) CPU in the system   * @cp: The cpupri context   * @p: The task   * @lowest_mask: A mask to fill in with selected CPUs (or NULL) @@ -58,84 +117,59 @@ static int convert_prio(int prio)   *   * Return: (int)bool - CPUs were found   */ -int cpupri_find(struct cpupri *cp, struct task_struct *p, +int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,  		struct cpumask *lowest_mask,  		bool (*fitness_fn)(struct task_struct *p, int cpu))  { -	int idx = 0;  	int task_pri = convert_prio(p->prio); +	int idx, cpu;  	BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);  	for (idx = 0; idx < task_pri; idx++) { -		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx]; -		int skip = 0; - -		if (!atomic_read(&(vec)->count)) -			skip = 1; -		/* -		 * When looking at the vector, we need to read the counter, -		 * do a memory barrier, then read the mask. -		 * -		 * Note: This is still all racey, but we can deal with it. -		 *  Ideally, we only want to look at masks that are set. -		 * -		 *  If a mask is not set, then the only thing wrong is that we -		 *  did a little more work than necessary. -		 * -		 *  If we read a zero count but the mask is set, because of the -		 *  memory barriers, that can only happen when the highest prio -		 *  task for a run queue has left the run queue, in which case, -		 *  it will be followed by a pull. If the task we are processing -		 *  fails to find a proper place to go, that pull request will -		 *  pull this task if the run queue is running at a lower -		 *  priority. -		 */ -		smp_rmb(); -		/* Need to do the rmb for every iteration */ -		if (skip) +		if (!__cpupri_find(cp, p, lowest_mask, idx))  			continue; -		if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) -			continue; +		if (!lowest_mask || !fitness_fn) +			return 1; -		if (lowest_mask) { -			int cpu; - -			cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); - -			/* -			 * We have to ensure that we have at least one bit -			 * still set in the array, since the map could have -			 * been concurrently emptied between the first and -			 * second reads of vec->mask.  If we hit this -			 * condition, simply act as though we never hit this -			 * priority level and continue on. -			 */ -			if (cpumask_empty(lowest_mask)) -				continue; - -			if (!fitness_fn) -				return 1; - -			/* Ensure the capacity of the CPUs fit the task */ -			for_each_cpu(cpu, lowest_mask) { -				if (!fitness_fn(p, cpu)) -					cpumask_clear_cpu(cpu, lowest_mask); -			} - -			/* -			 * If no CPU at the current priority can fit the task -			 * continue looking -			 */ -			if (cpumask_empty(lowest_mask)) -				continue; +		/* Ensure the capacity of the CPUs fit the task */ +		for_each_cpu(cpu, lowest_mask) { +			if (!fitness_fn(p, cpu)) +				cpumask_clear_cpu(cpu, lowest_mask);  		} +		/* +		 * If no CPU at the current priority can fit the task +		 * continue looking +		 */ +		if (cpumask_empty(lowest_mask)) +			continue; +  		return 1;  	} +	/* +	 * If we failed to find a fitting lowest_mask, kick off a new search +	 * but without taking into account any fitness criteria this time. 
+	 * +	 * This rule favours honouring priority over fitting the task in the +	 * correct CPU (Capacity Awareness being the only user now). +	 * The idea is that if a higher priority task can run, then it should +	 * run even if this ends up being on unfitting CPU. +	 * +	 * The cost of this trade-off is not entirely clear and will probably +	 * be good for some workloads and bad for others. +	 * +	 * The main idea here is that if some CPUs were overcommitted, we try +	 * to spread which is what the scheduler traditionally did. Sys admins +	 * must do proper RT planning to avoid overloading the system if they +	 * really care. +	 */ +	if (fitness_fn) +		return cpupri_find(cp, p, lowest_mask); +  	return 0;  } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 32dd520db11f..efbb492bb94c 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -19,8 +19,10 @@ struct cpupri {  #ifdef CONFIG_SMP  int  cpupri_find(struct cpupri *cp, struct task_struct *p, -		 struct cpumask *lowest_mask, -		 bool (*fitness_fn)(struct task_struct *p, int cpu)); +		 struct cpumask *lowest_mask); +int  cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, +			 struct cpumask *lowest_mask, +			 bool (*fitness_fn)(struct task_struct *p, int cpu));  void cpupri_set(struct cpupri *cp, int cpu, int pri);  int  cpupri_init(struct cpupri *cp);  void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cff3e656566d..ff9435dee1df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -909,8 +909,10 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)  	} while (read_seqcount_retry(&vtime->seqcount, seq));  } -static int vtime_state_check(struct vtime *vtime, int cpu) +static int vtime_state_fetch(struct vtime *vtime, int cpu)  { +	int state = READ_ONCE(vtime->state); +  	/*  	 * We raced against a context switch, fetch the  	 * kcpustat task again. @@ -927,10 +929,10 @@ static int vtime_state_check(struct vtime *vtime, int cpu)  	 *  	 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.  	 
*/ -	if (vtime->state == VTIME_INACTIVE) +	if (state == VTIME_INACTIVE)  		return -EAGAIN; -	return 0; +	return state;  }  static u64 kcpustat_user_vtime(struct vtime *vtime) @@ -949,14 +951,15 @@ static int kcpustat_field_vtime(u64 *cpustat,  {  	struct vtime *vtime = &tsk->vtime;  	unsigned int seq; -	int err;  	do { +		int state; +  		seq = read_seqcount_begin(&vtime->seqcount); -		err = vtime_state_check(vtime, cpu); -		if (err < 0) -			return err; +		state = vtime_state_fetch(vtime, cpu); +		if (state < 0) +			return state;  		*val = cpustat[usage]; @@ -969,7 +972,7 @@ static int kcpustat_field_vtime(u64 *cpustat,  		 */  		switch (usage) {  		case CPUTIME_SYSTEM: -			if (vtime->state == VTIME_SYS) +			if (state == VTIME_SYS)  				*val += vtime->stime + vtime_delta(vtime);  			break;  		case CPUTIME_USER: @@ -981,11 +984,11 @@ static int kcpustat_field_vtime(u64 *cpustat,  				*val += kcpustat_user_vtime(vtime);  			break;  		case CPUTIME_GUEST: -			if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0) +			if (state == VTIME_GUEST && task_nice(tsk) <= 0)  				*val += vtime->gtime + vtime_delta(vtime);  			break;  		case CPUTIME_GUEST_NICE: -			if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0) +			if (state == VTIME_GUEST && task_nice(tsk) > 0)  				*val += vtime->gtime + vtime_delta(vtime);  			break;  		default: @@ -1000,12 +1003,12 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat,  		   enum cpu_usage_stat usage, int cpu)  {  	u64 *cpustat = kcpustat->cpustat; +	u64 val = cpustat[usage];  	struct rq *rq; -	u64 val;  	int err;  	if (!vtime_accounting_enabled_cpu(cpu)) -		return cpustat[usage]; +		return val;  	rq = cpu_rq(cpu); @@ -1036,23 +1039,23 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,  {  	struct vtime *vtime = &tsk->vtime;  	unsigned int seq; -	int err;  	do {  		u64 *cpustat;  		u64 delta; +		int state;  		seq = read_seqcount_begin(&vtime->seqcount); -		err = vtime_state_check(vtime, cpu); -		if (err < 0) -			return err; +		state = vtime_state_fetch(vtime, cpu); +		if (state < 0) +			return state;  		*dst = *src;  		cpustat = dst->cpustat;  		/* Task is sleeping, dead or idle, nothing to add */ -		if (vtime->state < VTIME_SYS) +		if (state < VTIME_SYS)  			continue;  		delta = vtime_delta(vtime); @@ -1061,15 +1064,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,  		 * Task runs either in user (including guest) or kernel space,  		 * add pending nohz time to the right place.  		 
*/ -		if (vtime->state == VTIME_SYS) { +		if (state == VTIME_SYS) {  			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; -		} else if (vtime->state == VTIME_USER) { +		} else if (state == VTIME_USER) {  			if (task_nice(tsk) > 0)  				cpustat[CPUTIME_NICE] += vtime->utime + delta;  			else  				cpustat[CPUTIME_USER] += vtime->utime + delta;  		} else { -			WARN_ON_ONCE(vtime->state != VTIME_GUEST); +			WARN_ON_ONCE(state != VTIME_GUEST);  			if (task_nice(tsk) > 0) {  				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;  				cpustat[CPUTIME_NICE] += vtime->gtime + delta; @@ -1080,7 +1083,7 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,  		}  	} while (read_seqcount_retry(&vtime->seqcount, seq)); -	return err; +	return 0;  }  void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43323f875cb9..504d2f51b0d6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -153,7 +153,7 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)  		__sub_running_bw(dl_se->dl_bw, dl_rq);  } -void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_change_utilization(struct task_struct *p, u64 new_bw)  {  	struct rq *rq; @@ -334,6 +334,8 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)  	return dl_rq->root.rb_leftmost == &dl_se->rb_node;  } +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); +  void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)  {  	raw_spin_lock_init(&dl_b->dl_runtime_lock); @@ -2496,7 +2498,7 @@ int sched_dl_global_validate(void)  	return ret;  } -void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)  {  	if (global_rt_runtime() == RUNTIME_INF) {  		dl_rq->bw_ratio = 1 << RATIO_SHIFT; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 879d3ccf3806..a562df57a86e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -402,11 +402,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group  	}  	P(se->load.weight); -	P(se->runnable_weight);  #ifdef CONFIG_SMP  	P(se->avg.load_avg);  	P(se->avg.util_avg); -	P(se->avg.runnable_load_avg); +	P(se->avg.runnable_avg);  #endif  #undef PN_SCHEDSTAT @@ -524,11 +523,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);  	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);  #ifdef CONFIG_SMP -	SEQ_printf(m, "  .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);  	SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",  			cfs_rq->avg.load_avg); -	SEQ_printf(m, "  .%-30s: %lu\n", "runnable_load_avg", -			cfs_rq->avg.runnable_load_avg); +	SEQ_printf(m, "  .%-30s: %lu\n", "runnable_avg", +			cfs_rq->avg.runnable_avg);  	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",  			cfs_rq->avg.util_avg);  	SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued", @@ -537,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  			cfs_rq->removed.load_avg);  	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",  			cfs_rq->removed.util_avg); -	SEQ_printf(m, "  .%-30s: %ld\n", "removed.runnable_sum", -			cfs_rq->removed.runnable_sum); +	SEQ_printf(m, "  .%-30s: %ld\n", "removed.runnable_avg", +			cfs_rq->removed.runnable_avg);  #ifdef CONFIG_FAIR_GROUP_SCHED  	SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",  			cfs_rq->tg_load_avg_contrib); @@ 
-818,10 +816,12 @@ static int __init init_sched_debug_procfs(void)  __initcall(init_sched_debug_procfs); -#define __P(F)	SEQ_printf(m, "%-45s:%21Ld\n",	     #F, (long long)F) -#define   P(F)	SEQ_printf(m, "%-45s:%21Ld\n",	     #F, (long long)p->F) -#define __PN(F)	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define   PN(F)	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) +#define __P(F) __PS(#F, F) +#define   P(F) __PS(#F, p->F) +#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) +#define __PN(F) __PSN(#F, F) +#define   PN(F) __PSN(#F, p->F)  #ifdef CONFIG_NUMA_BALANCING @@ -870,18 +870,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,  	SEQ_printf(m,  		"---------------------------------------------------------"  		"----------\n"); -#define __P(F) \ -	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ -	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define P_SCHEDSTAT(F) \ -	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) -#define __PN(F) \ -	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ -	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) -#define PN_SCHEDSTAT(F) \ -	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) + +#define P_SCHEDSTAT(F)  __PS(#F, schedstat_val(p->F)) +#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))  	PN(se.exec_start);  	PN(se.vruntime); @@ -941,24 +932,27 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,  	}  	__P(nr_switches); -	SEQ_printf(m, "%-45s:%21Ld\n", -		   "nr_voluntary_switches", (long long)p->nvcsw); -	SEQ_printf(m, "%-45s:%21Ld\n", -		   "nr_involuntary_switches", (long long)p->nivcsw); +	__PS("nr_voluntary_switches", p->nvcsw); +	__PS("nr_involuntary_switches", p->nivcsw);  	P(se.load.weight); -	P(se.runnable_weight);  #ifdef CONFIG_SMP  	P(se.avg.load_sum); -	P(se.avg.runnable_load_sum); +	P(se.avg.runnable_sum);  	P(se.avg.util_sum);  	P(se.avg.load_avg); -	P(se.avg.runnable_load_avg); +	P(se.avg.runnable_avg);  	P(se.avg.util_avg);  	P(se.avg.last_update_time);  	P(se.avg.util_est.ewma);  	P(se.avg.util_est.enqueued);  #endif +#ifdef CONFIG_UCLAMP_TASK +	__PS("uclamp.min", p->uclamp[UCLAMP_MIN].value); +	__PS("uclamp.max", p->uclamp[UCLAMP_MAX].value); +	__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); +	__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); +#endif  	P(policy);  	P(prio);  	if (task_has_dl_policy(p)) { @@ -966,11 +960,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,  		P(dl.deadline);  	}  #undef PN_SCHEDSTAT -#undef PN -#undef __PN  #undef P_SCHEDSTAT -#undef P -#undef __P  	{  		unsigned int this_cpu = raw_smp_processor_id(); @@ -978,8 +968,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,  		t0 = cpu_clock(this_cpu);  		t1 = cpu_clock(this_cpu); -		SEQ_printf(m, "%-45s:%21Ld\n", -			   "clock-delta", (long long)(t1-t0)); +		__PS("clock-delta", t1-t0);  	}  	sched_show_numa(p, m); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c1217bfe5e81..02f323b85b6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -86,6 +86,19 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;  const_debug unsigned int sysctl_sched_migration_cost	= 500000UL; +int sched_thermal_decay_shift; 
+static int __init setup_sched_thermal_decay_shift(char *str) +{ +	int _shift = 0; + +	if (kstrtoint(str, 0, &_shift)) +		pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); + +	sched_thermal_decay_shift = clamp(_shift, 0, 10); +	return 1; +} +__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); +  #ifdef CONFIG_SMP  /*   * For asym packing, by default the lower numbered CPU has higher priority. @@ -741,9 +754,7 @@ void init_entity_runnable_average(struct sched_entity *se)  	 * nothing has been attached to the task group yet.  	 */  	if (entity_is_task(se)) -		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); - -	se->runnable_weight = se->load.weight; +		sa->load_avg = scale_load_down(se->load.weight);  	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */  } @@ -796,6 +807,8 @@ void post_init_entity_util_avg(struct task_struct *p)  		}  	} +	sa->runnable_avg = cpu_scale; +  	if (p->sched_class != &fair_sched_class) {  		/*  		 * For !fair tasks do: @@ -1473,36 +1486,51 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,  	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;  } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); - -static unsigned long cpu_runnable_load(struct rq *rq) -{ -	return cfs_rq_runnable_load_avg(&rq->cfs); -} +/* + * 'numa_type' describes the node at the moment of load balancing. + */ +enum numa_type { +	/* The node has spare capacity that can be used to run more tasks.  */ +	node_has_spare = 0, +	/* +	 * The node is fully used and the tasks don't compete for more CPU +	 * cycles. Nevertheless, some tasks might wait before running. +	 */ +	node_fully_busy, +	/* +	 * The node is overloaded and can't provide expected CPU cycles to all +	 * tasks. 
+	 */ +	node_overloaded +};  /* Cached statistics for all CPUs within a node */  struct numa_stats {  	unsigned long load; - +	unsigned long util;  	/* Total compute capacity of CPUs on a node */  	unsigned long compute_capacity; +	unsigned int nr_running; +	unsigned int weight; +	enum numa_type node_type; +	int idle_cpu;  }; -/* - * XXX borrowed from update_sg_lb_stats - */ -static void update_numa_stats(struct numa_stats *ns, int nid) +static inline bool is_core_idle(int cpu)  { -	int cpu; +#ifdef CONFIG_SCHED_SMT +	int sibling; -	memset(ns, 0, sizeof(*ns)); -	for_each_cpu(cpu, cpumask_of_node(nid)) { -		struct rq *rq = cpu_rq(cpu); +	for_each_cpu(sibling, cpu_smt_mask(cpu)) { +		if (cpu == sibling) +			continue; -		ns->load += cpu_runnable_load(rq); -		ns->compute_capacity += capacity_of(cpu); +		if (!idle_cpu(cpu)) +			return false;  	} +#endif +	return true;  }  struct task_numa_env { @@ -1521,20 +1549,128 @@ struct task_numa_env {  	int best_cpu;  }; +static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_util(int cpu); +static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); + +static inline enum +numa_type numa_classify(unsigned int imbalance_pct, +			 struct numa_stats *ns) +{ +	if ((ns->nr_running > ns->weight) && +	    ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) +		return node_overloaded; + +	if ((ns->nr_running < ns->weight) || +	    ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) +		return node_has_spare; + +	return node_fully_busy; +} + +#ifdef CONFIG_SCHED_SMT +/* Forward declarations of select_idle_sibling helpers */ +static inline bool test_idle_cores(int cpu, bool def); +static inline int numa_idle_core(int idle_core, int cpu) +{ +	if (!static_branch_likely(&sched_smt_present) || +	    idle_core >= 0 || !test_idle_cores(cpu, false)) +		return idle_core; + +	/* +	 * Prefer cores instead of packing HT siblings +	 * and triggering future load balancing. +	 */ +	if (is_core_idle(cpu)) +		idle_core = cpu; + +	return idle_core; +} +#else +static inline int numa_idle_core(int idle_core, int cpu) +{ +	return idle_core; +} +#endif + +/* + * Gather all necessary information to make NUMA balancing placement + * decisions that are compatible with standard load balancer. This + * borrows code and logic from update_sg_lb_stats but sharing a + * common implementation is impractical. + */ +static void update_numa_stats(struct task_numa_env *env, +			      struct numa_stats *ns, int nid, +			      bool find_idle) +{ +	int cpu, idle_core = -1; + +	memset(ns, 0, sizeof(*ns)); +	ns->idle_cpu = -1; + +	rcu_read_lock(); +	for_each_cpu(cpu, cpumask_of_node(nid)) { +		struct rq *rq = cpu_rq(cpu); + +		ns->load += cpu_load(rq); +		ns->util += cpu_util(cpu); +		ns->nr_running += rq->cfs.h_nr_running; +		ns->compute_capacity += capacity_of(cpu); + +		if (find_idle && !rq->nr_running && idle_cpu(cpu)) { +			if (READ_ONCE(rq->numa_migrate_on) || +			    !cpumask_test_cpu(cpu, env->p->cpus_ptr)) +				continue; + +			if (ns->idle_cpu == -1) +				ns->idle_cpu = cpu; + +			idle_core = numa_idle_core(idle_core, cpu); +		} +	} +	rcu_read_unlock(); + +	ns->weight = cpumask_weight(cpumask_of_node(nid)); + +	ns->node_type = numa_classify(env->imbalance_pct, ns); + +	if (idle_core >= 0) +		ns->idle_cpu = idle_core; +} +  static void task_numa_assign(struct task_numa_env *env,  			     struct task_struct *p, long imp)  {  	struct rq *rq = cpu_rq(env->dst_cpu); -	/* Bail out if run-queue part of active NUMA balance. 
*/ -	if (xchg(&rq->numa_migrate_on, 1)) +	/* Check if run-queue part of active NUMA balance. */ +	if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { +		int cpu; +		int start = env->dst_cpu; + +		/* Find alternative idle CPU. */ +		for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { +			if (cpu == env->best_cpu || !idle_cpu(cpu) || +			    !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { +				continue; +			} + +			env->dst_cpu = cpu; +			rq = cpu_rq(env->dst_cpu); +			if (!xchg(&rq->numa_migrate_on, 1)) +				goto assign; +		} + +		/* Failed to find an alternative idle CPU */  		return; +	} +assign:  	/*  	 * Clear previous best_cpu/rq numa-migrate flag, since task now  	 * found a better CPU to move/swap.  	 */ -	if (env->best_cpu != -1) { +	if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {  		rq = cpu_rq(env->best_cpu);  		WRITE_ONCE(rq->numa_migrate_on, 0);  	} @@ -1590,7 +1726,7 @@ static bool load_too_imbalanced(long src_load, long dst_load,   * into account that it might be best if task running on the dst_cpu should   * be exchanged with the source task   */ -static void task_numa_compare(struct task_numa_env *env, +static bool task_numa_compare(struct task_numa_env *env,  			      long taskimp, long groupimp, bool maymove)  {  	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); @@ -1601,9 +1737,10 @@ static void task_numa_compare(struct task_numa_env *env,  	int dist = env->dist;  	long moveimp = imp;  	long load; +	bool stopsearch = false;  	if (READ_ONCE(dst_rq->numa_migrate_on)) -		return; +		return false;  	rcu_read_lock();  	cur = rcu_dereference(dst_rq->curr); @@ -1614,8 +1751,10 @@ static void task_numa_compare(struct task_numa_env *env,  	 * Because we have preemption enabled we can get migrated around and  	 * end try selecting ourselves (current == env->p) as a swap candidate.  	 */ -	if (cur == env->p) +	if (cur == env->p) { +		stopsearch = true;  		goto unlock; +	}  	if (!cur) {  		if (maymove && moveimp >= env->best_imp) @@ -1624,18 +1763,27 @@ static void task_numa_compare(struct task_numa_env *env,  			goto unlock;  	} +	/* Skip this swap candidate if cannot move to the source cpu. */ +	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) +		goto unlock; + +	/* +	 * Skip this swap candidate if it is not moving to its preferred +	 * node and the best task is. +	 */ +	if (env->best_task && +	    env->best_task->numa_preferred_nid == env->src_nid && +	    cur->numa_preferred_nid != env->src_nid) { +		goto unlock; +	} +  	/*  	 * "imp" is the fault differential for the source task between the  	 * source and destination node. Calculate the total differential for  	 * the source task and potential destination task. The more negative  	 * the value is, the more remote accesses that would be expected to  	 * be incurred if the tasks were swapped. -	 */ -	/* Skip this swap candidate if cannot move to the source cpu */ -	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) -		goto unlock; - -	/* +	 *  	 * If dst and source tasks are in the same NUMA group, or not  	 * in any group then look only at task weights.  	 */ @@ -1662,6 +1810,19 @@ static void task_numa_compare(struct task_numa_env *env,  			       task_weight(cur, env->dst_nid, dist);  	} +	/* Discourage picking a task already on its preferred node */ +	if (cur->numa_preferred_nid == env->dst_nid) +		imp -= imp / 16; + +	/* +	 * Encourage picking a task that moves to its preferred node. 
+	 * This potentially makes imp larger than it's maximum of +	 * 1998 (see SMALLIMP and task_weight for why) but in this +	 * case, it does not matter. +	 */ +	if (cur->numa_preferred_nid == env->src_nid) +		imp += imp / 8; +  	if (maymove && moveimp > imp && moveimp > env->best_imp) {  		imp = moveimp;  		cur = NULL; @@ -1669,6 +1830,15 @@ static void task_numa_compare(struct task_numa_env *env,  	}  	/* +	 * Prefer swapping with a task moving to its preferred node over a +	 * task that is not. +	 */ +	if (env->best_task && cur->numa_preferred_nid == env->src_nid && +	    env->best_task->numa_preferred_nid != env->src_nid) { +		goto assign; +	} + +	/*  	 * If the NUMA importance is less than SMALLIMP,  	 * task migration might only result in ping pong  	 * of tasks and also hurt performance due to cache @@ -1691,42 +1861,95 @@ static void task_numa_compare(struct task_numa_env *env,  		goto unlock;  assign: -	/* -	 * One idle CPU per node is evaluated for a task numa move. -	 * Call select_idle_sibling to maybe find a better one. -	 */ +	/* Evaluate an idle CPU for a task numa move. */  	if (!cur) { +		int cpu = env->dst_stats.idle_cpu; + +		/* Nothing cached so current CPU went idle since the search. */ +		if (cpu < 0) +			cpu = env->dst_cpu; +  		/* -		 * select_idle_siblings() uses an per-CPU cpumask that -		 * can be used from IRQ context. +		 * If the CPU is no longer truly idle and the previous best CPU +		 * is, keep using it.  		 */ -		local_irq_disable(); -		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, -						   env->dst_cpu); -		local_irq_enable(); +		if (!idle_cpu(cpu) && env->best_cpu >= 0 && +		    idle_cpu(env->best_cpu)) { +			cpu = env->best_cpu; +		} + +		env->dst_cpu = cpu;  	}  	task_numa_assign(env, cur, imp); + +	/* +	 * If a move to idle is allowed because there is capacity or load +	 * balance improves then stop the search. While a better swap +	 * candidate may exist, a search is not free. +	 */ +	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) +		stopsearch = true; + +	/* +	 * If a swap candidate must be identified and the current best task +	 * moves its preferred node then stop the search. +	 */ +	if (!maymove && env->best_task && +	    env->best_task->numa_preferred_nid == env->src_nid) { +		stopsearch = true; +	}  unlock:  	rcu_read_unlock(); + +	return stopsearch;  }  static void task_numa_find_cpu(struct task_numa_env *env,  				long taskimp, long groupimp)  { -	long src_load, dst_load, load;  	bool maymove = false;  	int cpu; -	load = task_h_load(env->p); -	dst_load = env->dst_stats.load + load; -	src_load = env->src_stats.load - load; -  	/* -	 * If the improvement from just moving env->p direction is better -	 * than swapping tasks around, check if a move is possible. +	 * If dst node has spare capacity, then check if there is an +	 * imbalance that would be overruled by the load balancer.  	 */ -	maymove = !load_too_imbalanced(src_load, dst_load, env); +	if (env->dst_stats.node_type == node_has_spare) { +		unsigned int imbalance; +		int src_running, dst_running; + +		/* +		 * Would movement cause an imbalance? Note that if src has +		 * more running tasks that the imbalance is ignored as the +		 * move improves the imbalance from the perspective of the +		 * CPU load balancer. 
+		 * */ +		src_running = env->src_stats.nr_running - 1; +		dst_running = env->dst_stats.nr_running + 1; +		imbalance = max(0, dst_running - src_running); +		imbalance = adjust_numa_imbalance(imbalance, src_running); + +		/* Use idle CPU if there is no imbalance */ +		if (!imbalance) { +			maymove = true; +			if (env->dst_stats.idle_cpu >= 0) { +				env->dst_cpu = env->dst_stats.idle_cpu; +				task_numa_assign(env, NULL, 0); +				return; +			} +		} +	} else { +		long src_load, dst_load, load; +		/* +		 * If the improvement from just moving env->p direction is better +		 * than swapping tasks around, check if a move is possible. +		 */ +		load = task_h_load(env->p); +		dst_load = env->dst_stats.load + load; +		src_load = env->src_stats.load - load; +		maymove = !load_too_imbalanced(src_load, dst_load, env); +	}  	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {  		/* Skip this CPU if the source task cannot migrate */ @@ -1734,7 +1957,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,  			continue;  		env->dst_cpu = cpu; -		task_numa_compare(env, taskimp, groupimp, maymove); +		if (task_numa_compare(env, taskimp, groupimp, maymove)) +			break;  	}  } @@ -1788,10 +2012,10 @@ static int task_numa_migrate(struct task_struct *p)  	dist = env.dist = node_distance(env.src_nid, env.dst_nid);  	taskweight = task_weight(p, env.src_nid, dist);  	groupweight = group_weight(p, env.src_nid, dist); -	update_numa_stats(&env.src_stats, env.src_nid); +	update_numa_stats(&env, &env.src_stats, env.src_nid, false);  	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;  	groupimp = group_weight(p, env.dst_nid, dist) - groupweight; -	update_numa_stats(&env.dst_stats, env.dst_nid); +	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);  	/* Try to find a spot on the preferred nid. */  	task_numa_find_cpu(&env, taskimp, groupimp); @@ -1824,7 +2048,7 @@ static int task_numa_migrate(struct task_struct *p)  			env.dist = dist;  			env.dst_nid = nid; -			update_numa_stats(&env.dst_stats, env.dst_nid); +			update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);  			task_numa_find_cpu(&env, taskimp, groupimp);  		}  	} @@ -1848,15 +2072,17 @@ static int task_numa_migrate(struct task_struct *p)  	}  	/* No better CPU than the current one was found. 
*/ -	if (env.best_cpu == -1) +	if (env.best_cpu == -1) { +		trace_sched_stick_numa(p, env.src_cpu, NULL, -1);  		return -EAGAIN; +	}  	best_rq = cpu_rq(env.best_cpu);  	if (env.best_task == NULL) {  		ret = migrate_task_to(p, env.best_cpu);  		WRITE_ONCE(best_rq->numa_migrate_on, 0);  		if (ret != 0) -			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); +			trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);  		return ret;  	} @@ -1864,7 +2090,7 @@ static int task_numa_migrate(struct task_struct *p)  	WRITE_ONCE(best_rq->numa_migrate_on, 0);  	if (ret != 0) -		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); +		trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);  	put_task_struct(env.best_task);  	return ret;  } @@ -2573,7 +2799,7 @@ static void task_numa_work(struct callback_head *work)  		 * Skip inaccessible VMAs to avoid any confusion between  		 * PROT_NONE and NUMA hinting ptes  		 */ -		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) +		if (!vma_is_accessible(vma))  			continue;  		do { @@ -2835,25 +3061,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  #ifdef CONFIG_SMP  static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -	cfs_rq->runnable_weight += se->runnable_weight; - -	cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; -	cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; -} - -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -	cfs_rq->runnable_weight -= se->runnable_weight; - -	sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); -	sub_positive(&cfs_rq->avg.runnable_load_sum, -		     se_runnable(se) * se->avg.runnable_load_sum); -} - -static inline void  enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	cfs_rq->avg.load_avg += se->avg.load_avg; @@ -2868,28 +3075,22 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  #else  static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void  enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }  static inline void  dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }  #endif  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, -			    unsigned long weight, unsigned long runnable) +			    unsigned long weight)  {  	if (se->on_rq) {  		/* commit outstanding execution time */  		if (cfs_rq->curr == se)  			update_curr(cfs_rq);  		account_entity_dequeue(cfs_rq, se); -		dequeue_runnable_load_avg(cfs_rq, se);  	}  	dequeue_load_avg(cfs_rq, se); -	se->runnable_weight = runnable;  	update_load_set(&se->load, weight);  #ifdef CONFIG_SMP @@ -2897,16 +3098,13 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,  		u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;  		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); -		se->avg.runnable_load_avg = -			div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);  	} while (0);  #endif  	enqueue_load_avg(cfs_rq, se); -	if (se->on_rq) { +	if (se->on_rq)  		account_entity_enqueue(cfs_rq, se); -		enqueue_runnable_load_avg(cfs_rq, se); -	} +  }  void reweight_task(struct task_struct *p, int prio) @@ -2916,7 +3114,7 @@ void reweight_task(struct task_struct *p, int prio)  	
struct load_weight *load = &se->load;  	unsigned long weight = scale_load(sched_prio_to_weight[prio]); -	reweight_entity(cfs_rq, se, weight, weight); +	reweight_entity(cfs_rq, se, weight);  	load->inv_weight = sched_prio_to_wmult[prio];  } @@ -3028,50 +3226,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)  	 */  	return clamp_t(long, shares, MIN_SHARES, tg_shares);  } - -/* - * This calculates the effective runnable weight for a group entity based on - * the group entity weight calculated above. - * - * Because of the above approximation (2), our group entity weight is - * an load_avg based ratio (3). This means that it includes blocked load and - * does not represent the runnable weight. - * - * Approximate the group entity's runnable weight per ratio from the group - * runqueue: - * - *					     grq->avg.runnable_load_avg - *   ge->runnable_weight = ge->load.weight * -------------------------- (7) - *						 grq->avg.load_avg - * - * However, analogous to above, since the avg numbers are slow, this leads to - * transients in the from-idle case. Instead we use: - * - *   ge->runnable_weight = ge->load.weight * - * - *		max(grq->avg.runnable_load_avg, grq->runnable_weight) - *		-----------------------------------------------------	(8) - *		      max(grq->avg.load_avg, grq->load.weight) - * - * Where these max() serve both to use the 'instant' values to fix the slow - * from-idle and avoid the /0 on to-idle, similar to (6). - */ -static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) -{ -	long runnable, load_avg; - -	load_avg = max(cfs_rq->avg.load_avg, -		       scale_load_down(cfs_rq->load.weight)); - -	runnable = max(cfs_rq->avg.runnable_load_avg, -		       scale_load_down(cfs_rq->runnable_weight)); - -	runnable *= shares; -	if (load_avg) -		runnable /= load_avg; - -	return clamp_t(long, runnable, MIN_SHARES, shares); -}  #endif /* CONFIG_SMP */  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -3083,7 +3237,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);  static void update_cfs_group(struct sched_entity *se)  {  	struct cfs_rq *gcfs_rq = group_cfs_rq(se); -	long shares, runnable; +	long shares;  	if (!gcfs_rq)  		return; @@ -3092,16 +3246,15 @@ static void update_cfs_group(struct sched_entity *se)  		return;  #ifndef CONFIG_SMP -	runnable = shares = READ_ONCE(gcfs_rq->tg->shares); +	shares = READ_ONCE(gcfs_rq->tg->shares);  	if (likely(se->load.weight == shares))  		return;  #else  	shares   = calc_group_shares(gcfs_rq); -	runnable = calc_group_runnable(gcfs_rq, shares);  #endif -	reweight_entity(cfs_rq_of(se), se, shares, runnable); +	reweight_entity(cfs_rq_of(se), se, shares);  }  #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3226,11 +3379,11 @@ void set_task_rq_fair(struct sched_entity *se,   * _IFF_ we look at the pure running and runnable sums. Because they   * represent the very same entity, just at different points in the hierarchy.   * - * Per the above update_tg_cfs_util() is trivial and simply copies the running - * sum over (but still wrong, because the group entity and group rq do not have - * their PELT windows aligned). + * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial + * and simply copies the running/runnable sum over (but still wrong, because + * the group entity and group rq do not have their PELT windows aligned).   * - * However, update_tg_cfs_runnable() is more complex. So we have: + * However, update_tg_cfs_load() is more complex. 
So we have:   *   *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg		(2)   * @@ -3313,9 +3466,35 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq  static inline void  update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)  { +	long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + +	/* Nothing to update */ +	if (!delta) +		return; + +	/* +	 * The relation between sum and avg is: +	 * +	 *   LOAD_AVG_MAX - 1024 + sa->period_contrib +	 * +	 * however, the PELT windows are not aligned between grq and gse. +	 */ + +	/* Set new sched_entity's runnable */ +	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; +	se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + +	/* Update parent cfs_rq runnable */ +	add_positive(&cfs_rq->avg.runnable_avg, delta); +	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; +} + +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) +{  	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; -	unsigned long runnable_load_avg, load_avg; -	u64 runnable_load_sum, load_sum = 0; +	unsigned long load_avg; +	u64 load_sum = 0;  	s64 delta_sum;  	if (!runnable_sum) @@ -3363,20 +3542,6 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf  	se->avg.load_avg = load_avg;  	add_positive(&cfs_rq->avg.load_avg, delta_avg);  	add_positive(&cfs_rq->avg.load_sum, delta_sum); - -	runnable_load_sum = (s64)se_runnable(se) * runnable_sum; -	runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - -	if (se->on_rq) { -		delta_sum = runnable_load_sum - -				se_weight(se) * se->avg.runnable_load_sum; -		delta_avg = runnable_load_avg - se->avg.runnable_load_avg; -		add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); -		add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); -	} - -	se->avg.runnable_load_sum = runnable_sum; -	se->avg.runnable_load_avg = runnable_load_avg;  }  static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3405,6 +3570,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)  	update_tg_cfs_util(cfs_rq, se, gcfs_rq);  	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); +	update_tg_cfs_load(cfs_rq, se, gcfs_rq);  	trace_pelt_cfs_tp(cfs_rq);  	trace_pelt_se_tp(se); @@ -3474,7 +3640,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum  static inline int  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)  { -	unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; +	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;  	struct sched_avg *sa = &cfs_rq->avg;  	int decayed = 0; @@ -3485,7 +3651,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)  		raw_spin_lock(&cfs_rq->removed.lock);  		swap(cfs_rq->removed.util_avg, removed_util);  		swap(cfs_rq->removed.load_avg, removed_load); -		swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); +		swap(cfs_rq->removed.runnable_avg, removed_runnable);  		cfs_rq->removed.nr = 0;  		raw_spin_unlock(&cfs_rq->removed.lock); @@ -3497,7 +3663,16 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)  		sub_positive(&sa->util_avg, r);  		sub_positive(&sa->util_sum, r * divider); -		add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); +		r = removed_runnable; +		sub_positive(&sa->runnable_avg, r); +		sub_positive(&sa->runnable_sum, r * divider); + +		/* +		 * 
removed_runnable is the unweighted version of removed_load so we +		 * can use it to estimate removed_load_sum. +		 */ +		add_tg_cfs_propagate(cfs_rq, +			-(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);  		decayed = 1;  	} @@ -3542,17 +3717,19 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s  	 */  	se->avg.util_sum = se->avg.util_avg * divider; +	se->avg.runnable_sum = se->avg.runnable_avg * divider; +  	se->avg.load_sum = divider;  	if (se_weight(se)) {  		se->avg.load_sum =  			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));  	} -	se->avg.runnable_load_sum = se->avg.load_sum; -  	enqueue_load_avg(cfs_rq, se);  	cfs_rq->avg.util_avg += se->avg.util_avg;  	cfs_rq->avg.util_sum += se->avg.util_sum; +	cfs_rq->avg.runnable_avg += se->avg.runnable_avg; +	cfs_rq->avg.runnable_sum += se->avg.runnable_sum;  	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); @@ -3574,6 +3751,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s  	dequeue_load_avg(cfs_rq, se);  	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);  	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); +	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); +	sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);  	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3680,13 +3859,13 @@ static void remove_entity_load_avg(struct sched_entity *se)  	++cfs_rq->removed.nr;  	cfs_rq->removed.util_avg	+= se->avg.util_avg;  	cfs_rq->removed.load_avg	+= se->avg.load_avg; -	cfs_rq->removed.runnable_sum	+= se->avg.load_sum; /* == runnable_sum */ +	cfs_rq->removed.runnable_avg	+= se->avg.runnable_avg;  	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);  } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)  { -	return cfs_rq->avg.runnable_load_avg; +	return cfs_rq->avg.runnable_avg;  }  static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3957,6 +4136,7 @@ static inline void check_schedstat_required(void)  #endif  } +static inline bool cfs_bandwidth_used(void);  /*   * MIGRATION @@ -4021,8 +4201,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	 *   - Add its new weight to cfs_rq->load.weight  	 */  	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); +	se_update_runnable(se);  	update_cfs_group(se); -	enqueue_runnable_load_avg(cfs_rq, se);  	account_entity_enqueue(cfs_rq, se);  	if (flags & ENQUEUE_WAKEUP) @@ -4035,10 +4215,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  		__enqueue_entity(cfs_rq, se);  	se->on_rq = 1; -	if (cfs_rq->nr_running == 1) { +	/* +	 * When bandwidth control is enabled, cfs might have been removed +	 * because of a parent been throttled but cfs->nr_running > 1. Try to +	 * add it unconditionnally. +	 */ +	if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())  		list_add_leaf_cfs_rq(cfs_rq); + +	if (cfs_rq->nr_running == 1)  		check_enqueue_throttle(cfs_rq); -	}  }  static void __clear_buddies_last(struct sched_entity *se) @@ -4105,7 +4291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	 *     of its group cfs_rq.  	 
*/  	update_load_avg(cfs_rq, se, UPDATE_TG); -	dequeue_runnable_load_avg(cfs_rq, se); +	se_update_runnable(se);  	update_stats_dequeue(cfs_rq, se, flags); @@ -4541,8 +4727,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  		if (!se->on_rq)  			break; -		if (dequeue) +		if (dequeue) {  			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); +		} else { +			update_load_avg(qcfs_rq, se, 0); +			se_update_runnable(se); +		} +  		qcfs_rq->h_nr_running -= task_delta;  		qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4610,8 +4801,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  			enqueue = 0;  		cfs_rq = cfs_rq_of(se); -		if (enqueue) +		if (enqueue) {  			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); +		} else { +			update_load_avg(cfs_rq, se, 0); +			se_update_runnable(se); +		} +  		cfs_rq->h_nr_running += task_delta;  		cfs_rq->idle_h_nr_running += idle_task_delta; @@ -4619,21 +4815,31 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  			break;  	} -	assert_list_leaf_cfs_rq(rq); -  	if (!se)  		add_nr_running(rq, task_delta); +	/* +	 * The cfs_rq_throttled() breaks in the above iteration can result in +	 * incomplete leaf list maintenance, resulting in triggering the +	 * assertion below. +	 */ +	for_each_sched_entity(se) { +		cfs_rq = cfs_rq_of(se); + +		list_add_leaf_cfs_rq(cfs_rq); +	} + +	assert_list_leaf_cfs_rq(rq); +  	/* Determine whether we need to wake up potentially idle CPU: */  	if (rq->curr == rq->idle && rq->cfs.nr_running)  		resched_curr(rq);  } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) +static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)  {  	struct cfs_rq *cfs_rq; -	u64 runtime; -	u64 starting_runtime = remaining; +	u64 runtime, remaining = 1;  	rcu_read_lock();  	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -4648,10 +4854,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)  		/* By the above check, this should never be true */  		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); +		raw_spin_lock(&cfs_b->lock);  		runtime = -cfs_rq->runtime_remaining + 1; -		if (runtime > remaining) -			runtime = remaining; -		remaining -= runtime; +		if (runtime > cfs_b->runtime) +			runtime = cfs_b->runtime; +		cfs_b->runtime -= runtime; +		remaining = cfs_b->runtime; +		raw_spin_unlock(&cfs_b->lock);  		cfs_rq->runtime_remaining += runtime; @@ -4666,8 +4875,6 @@ next:  			break;  	}  	rcu_read_unlock(); - -	return starting_runtime - remaining;  }  /* @@ -4678,7 +4885,6 @@ next:   */  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)  { -	u64 runtime;  	int throttled;  	/* no need to continue the timer with no bandwidth constraint */ @@ -4707,24 +4913,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u  	cfs_b->nr_throttled += overrun;  	/* -	 * This check is repeated as we are holding onto the new bandwidth while -	 * we unthrottle. This can potentially race with an unthrottled group -	 * trying to acquire new bandwidth from the global pool. This can result -	 * in us over-using our runtime if it is all used during this loop, but -	 * only by limited amounts in that extreme case. +	 * This check is repeated as we release cfs_b->lock while we unthrottle.  	 
*/  	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { -		runtime = cfs_b->runtime;  		cfs_b->distribute_running = 1;  		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  		/* we can't nest cfs_b->lock while distributing bandwidth */ -		runtime = distribute_cfs_runtime(cfs_b, runtime); +		distribute_cfs_runtime(cfs_b);  		raw_spin_lock_irqsave(&cfs_b->lock, flags);  		cfs_b->distribute_running = 0;  		throttled = !list_empty(&cfs_b->throttled_cfs_rq); - -		lsub_positive(&cfs_b->runtime, runtime);  	}  	/* @@ -4858,10 +5057,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	if (!runtime)  		return; -	runtime = distribute_cfs_runtime(cfs_b, runtime); +	distribute_cfs_runtime(cfs_b);  	raw_spin_lock_irqsave(&cfs_b->lock, flags); -	lsub_positive(&cfs_b->runtime, runtime);  	cfs_b->distribute_running = 0;  	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  } @@ -5258,32 +5456,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		cfs_rq = cfs_rq_of(se);  		enqueue_entity(cfs_rq, se, flags); -		/* -		 * end evaluation on encountering a throttled cfs_rq -		 * -		 * note: in the case of encountering a throttled cfs_rq we will -		 * post the final h_nr_running increment below. -		 */ -		if (cfs_rq_throttled(cfs_rq)) -			break;  		cfs_rq->h_nr_running++;  		cfs_rq->idle_h_nr_running += idle_h_nr_running; +		/* end evaluation on encountering a throttled cfs_rq */ +		if (cfs_rq_throttled(cfs_rq)) +			goto enqueue_throttle; +  		flags = ENQUEUE_WAKEUP;  	}  	for_each_sched_entity(se) {  		cfs_rq = cfs_rq_of(se); + +		update_load_avg(cfs_rq, se, UPDATE_TG); +		se_update_runnable(se); +		update_cfs_group(se); +  		cfs_rq->h_nr_running++;  		cfs_rq->idle_h_nr_running += idle_h_nr_running; +		/* end evaluation on encountering a throttled cfs_rq */  		if (cfs_rq_throttled(cfs_rq)) -			break; - -		update_load_avg(cfs_rq, se, UPDATE_TG); -		update_cfs_group(se); +			goto enqueue_throttle;  	} +enqueue_throttle:  	if (!se) {  		add_nr_running(rq, 1);  		/* @@ -5344,17 +5542,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		cfs_rq = cfs_rq_of(se);  		dequeue_entity(cfs_rq, se, flags); -		/* -		 * end evaluation on encountering a throttled cfs_rq -		 * -		 * note: in the case of encountering a throttled cfs_rq we will -		 * post the final h_nr_running decrement below. 
-		*/ -		if (cfs_rq_throttled(cfs_rq)) -			break;  		cfs_rq->h_nr_running--;  		cfs_rq->idle_h_nr_running -= idle_h_nr_running; +		/* end evaluation on encountering a throttled cfs_rq */ +		if (cfs_rq_throttled(cfs_rq)) +			goto dequeue_throttle; +  		/* Don't dequeue parent if it has other entities besides us */  		if (cfs_rq->load.weight) {  			/* Avoid re-evaluating load for this entity: */ @@ -5372,16 +5566,21 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	for_each_sched_entity(se) {  		cfs_rq = cfs_rq_of(se); + +		update_load_avg(cfs_rq, se, UPDATE_TG); +		se_update_runnable(se); +		update_cfs_group(se); +  		cfs_rq->h_nr_running--;  		cfs_rq->idle_h_nr_running -= idle_h_nr_running; +		/* end evaluation on encountering a throttled cfs_rq */  		if (cfs_rq_throttled(cfs_rq)) -			break; +			goto dequeue_throttle; -		update_load_avg(cfs_rq, se, UPDATE_TG); -		update_cfs_group(se);  	} +dequeue_throttle:  	if (!se)  		sub_nr_running(rq, 1); @@ -5447,6 +5646,29 @@ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)  	return load;  } +static unsigned long cpu_runnable(struct rq *rq) +{ +	return cfs_rq_runnable_avg(&rq->cfs); +} + +static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) +{ +	struct cfs_rq *cfs_rq; +	unsigned int runnable; + +	/* Task has no contribution or is new */ +	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) +		return cpu_runnable(rq); + +	cfs_rq = &rq->cfs; +	runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + +	/* Discount task's runnable from CPU's runnable */ +	lsub_positive(&runnable, p->se.avg.runnable_avg); + +	return runnable; +} +  static unsigned long capacity_of(int cpu)  {  	return cpu_rq(cpu)->cpu_capacity; @@ -5786,10 +6008,12 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int  		bool idle = true;  		for_each_cpu(cpu, cpu_smt_mask(core)) { -			__cpumask_clear_cpu(cpu, cpus); -			if (!available_idle_cpu(cpu)) +			if (!available_idle_cpu(cpu)) {  				idle = false; +				break; +			}  		} +		cpumask_andnot(cpus, cpus, cpu_smt_mask(core));  		if (idle)  			return core; @@ -5847,8 +6071,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t  	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);  	struct sched_domain *this_sd;  	u64 avg_cost, avg_idle; -	u64 time, cost; -	s64 delta; +	u64 time;  	int this = smp_processor_id();  	int cpu, nr = INT_MAX; @@ -5886,14 +6109,46 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t  	}  	time = cpu_clock(this) - time; -	cost = this_sd->avg_scan_cost; -	delta = (s64)(time - cost) / 8; -	this_sd->avg_scan_cost += delta; +	update_avg(&this_sd->avg_scan_cost, time);  	return cpu;  }  /* + * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which + * the task fits. If no CPU is big enough, but there are idle ones, try to + * maximize capacity. 
+ */ +static int +select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) +{ +	unsigned long best_cap = 0; +	int cpu, best_cpu = -1; +	struct cpumask *cpus; + +	sync_entity_load_avg(&p->se); + +	cpus = this_cpu_cpumask_var_ptr(select_idle_mask); +	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + +	for_each_cpu_wrap(cpu, cpus, target) { +		unsigned long cpu_cap = capacity_of(cpu); + +		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) +			continue; +		if (task_fits_capacity(p, cpu_cap)) +			return cpu; + +		if (cpu_cap > best_cap) { +			best_cap = cpu_cap; +			best_cpu = cpu; +		} +	} + +	return best_cpu; +} + +/*   * Try and locate an idle core/thread in the LLC cache domain.   */  static int select_idle_sibling(struct task_struct *p, int prev, int target) @@ -5901,6 +6156,28 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)  	struct sched_domain *sd;  	int i, recent_used_cpu; +	/* +	 * For asymmetric CPU capacity systems, our domain of interest is +	 * sd_asym_cpucapacity rather than sd_llc. +	 */ +	if (static_branch_unlikely(&sched_asym_cpucapacity)) { +		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); +		/* +		 * On an asymmetric CPU capacity system where an exclusive +		 * cpuset defines a symmetric island (i.e. one unique +		 * capacity_orig value through the cpuset), the key will be set +		 * but the CPUs within that cpuset will not have a domain with +		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric +		 * capacity path. +		 */ +		if (!sd) +			goto symmetric; + +		i = select_idle_capacity(p, sd, target); +		return ((unsigned)i < nr_cpumask_bits) ? i : target; +	} + +symmetric:  	if (available_idle_cpu(target) || sched_idle_cpu(target))  		return target; @@ -6101,33 +6378,6 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)  }  /* - * Disable WAKE_AFFINE in the case where task @p doesn't fit in the - * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. - * - * In that case WAKE_AFFINE doesn't make sense and we'll let - * BALANCE_WAKE sort things out. - */ -static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) -{ -	long min_cap, max_cap; - -	if (!static_branch_unlikely(&sched_asym_cpucapacity)) -		return 0; - -	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); -	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; - -	/* Minimum capacity is close to max, no need to abort wake_affine */ -	if (max_cap - min_cap < max_cap >> 3) -		return 0; - -	/* Bring task utilization in sync with prev_cpu */ -	sync_entity_load_avg(&p->se); - -	return !task_fits_capacity(p, min_cap); -} - -/*   * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)   * to @dst_cpu.   
*/ @@ -6391,8 +6641,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  			new_cpu = prev_cpu;  		} -		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && -			      cpumask_test_cpu(cpu, p->cpus_ptr); +		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);  	}  	rcu_read_lock(); @@ -7506,6 +7755,9 @@ static inline bool others_have_blocked(struct rq *rq)  	if (READ_ONCE(rq->avg_dl.util_avg))  		return true; +	if (thermal_load_avg(rq)) +		return true; +  #ifdef CONFIG_HAVE_SCHED_AVG_IRQ  	if (READ_ONCE(rq->avg_irq.util_avg))  		return true; @@ -7531,6 +7783,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)  {  	const struct sched_class *curr_class;  	u64 now = rq_clock_pelt(rq); +	unsigned long thermal_pressure;  	bool decayed;  	/* @@ -7539,8 +7792,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)  	 */  	curr_class = rq->curr->sched_class; +	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); +  	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |  		  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | +		  update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |  		  update_irq_load_avg(rq, 0);  	if (others_have_blocked(rq)) @@ -7562,7 +7818,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)  	if (cfs_rq->avg.util_sum)  		return false; -	if (cfs_rq->avg.runnable_load_sum) +	if (cfs_rq->avg.runnable_sum)  		return false;  	return true; @@ -7700,7 +7956,8 @@ struct sg_lb_stats {  	unsigned long avg_load; /*Avg load across the CPUs of the group */  	unsigned long group_load; /* Total load over the CPUs of the group */  	unsigned long group_capacity; -	unsigned long group_util; /* Total utilization of the group */ +	unsigned long group_util; /* Total utilization over the CPUs of the group */ +	unsigned long group_runnable; /* Total runnable time over the CPUs of the group */  	unsigned int sum_nr_running; /* Nr of tasks running in the group */  	unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */  	unsigned int idle_cpus; @@ -7763,8 +8020,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)  	if (unlikely(irq >= max))  		return 1; +	/* +	 * avg_rt.util_avg and avg_dl.util_avg track binary signals +	 * (running and not running) with weights 0 and 1024 respectively. +	 * avg_thermal.load_avg tracks thermal pressure and the weighted +	 * average uses the actual delta max capacity(load). 
+	 */  	used = READ_ONCE(rq->avg_rt.util_avg);  	used += READ_ONCE(rq->avg_dl.util_avg); +	used += thermal_load_avg(rq);  	if (unlikely(used >= max))  		return 1; @@ -7921,6 +8185,10 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)  	if (sgs->sum_nr_running < sgs->group_weight)  		return true; +	if ((sgs->group_capacity * imbalance_pct) < +			(sgs->group_runnable * 100)) +		return false; +  	if ((sgs->group_capacity * 100) >  			(sgs->group_util * imbalance_pct))  		return true; @@ -7946,6 +8214,10 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)  			(sgs->group_util * imbalance_pct))  		return true; +	if ((sgs->group_capacity * imbalance_pct) < +			(sgs->group_runnable * 100)) +		return true; +  	return false;  } @@ -8040,6 +8312,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,  		sgs->group_load += cpu_load(rq);  		sgs->group_util += cpu_util(i); +		sgs->group_runnable += cpu_runnable(rq);  		sgs->sum_h_nr_running += rq->cfs.h_nr_running;  		nr_running = rq->nr_running; @@ -8315,6 +8588,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,  		sgs->group_load += cpu_load_without(rq, p);  		sgs->group_util += cpu_util_without(i, p); +		sgs->group_runnable += cpu_runnable_without(rq, p);  		local = task_running_on_cpu(i, p);  		sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; @@ -8345,7 +8619,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,  	 * Computing avg_load makes sense only when group is fully busy or  	 * overloaded  	 */ -	if (sgs->group_type < group_fully_busy) +	if (sgs->group_type == group_fully_busy || +		sgs->group_type == group_overloaded)  		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /  				sgs->group_capacity;  } @@ -8628,6 +8903,21 @@ next_group:  	}  } +static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) +{ +	unsigned int imbalance_min; + +	/* +	 * Allow a small imbalance based on a simple pair of communicating +	 * tasks that remain local when the source domain is almost idle. +	 */ +	imbalance_min = 2; +	if (src_nr_running <= imbalance_min) +		return 0; + +	return imbalance; +} +  /**   * calculate_imbalance - Calculate the amount of imbalance present within the   *			 groups of a given sched_domain during load balance. @@ -8724,24 +9014,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  		}  		/* Consider allowing a small imbalance between NUMA groups */ -		if (env->sd->flags & SD_NUMA) { -			unsigned int imbalance_min; - -			/* -			 * Compute an allowed imbalance based on a simple -			 * pair of communicating tasks that should remain -			 * local and ignore them. -			 * -			 * NOTE: Generally this would have been based on -			 * the domain size and this was evaluated. However, -			 * the benefit is similar across a range of workloads -			 * and machines but scaling by the domain size adds -			 * the risk that lower domains have to be rebalanced. 
-			 */ -			imbalance_min = 2; -			if (busiest->sum_nr_running <= imbalance_min) -				env->imbalance = 0; -		} +		if (env->sd->flags & SD_NUMA) +			env->imbalance = adjust_numa_imbalance(env->imbalance, +						busiest->sum_nr_running);  		return;  	} @@ -8761,6 +9036,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /  				sds->total_capacity; +		/* +		 * If the local group is more loaded than the selected +		 * busiest group don't try to pull any tasks. +		 */ +		if (local->avg_load >= busiest->avg_load) { +			env->imbalance = 0; +			return; +		}  	}  	/* @@ -9027,6 +9310,14 @@ static struct rq *find_busiest_queue(struct lb_env *env,  		case migrate_util:  			util = cpu_util(cpu_of(rq)); +			/* +			 * Don't try to pull utilization from a CPU with one +			 * running task. Whatever its utilization, we will fail +			 * detach the task. +			 */ +			if (nr_running <= 1) +				continue; +  			if (busiest_util < util) {  				busiest_util = util;  				busiest = rq; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 008d6ac2342b..808244f3ddd9 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -149,6 +149,9 @@ __setup("nohz_full=", housekeeping_nohz_full_setup);  static int __init housekeeping_isolcpus_setup(char *str)  {  	unsigned int flags = 0; +	bool illegal = false; +	char *par; +	int len;  	while (isalpha(*str)) {  		if (!strncmp(str, "nohz,", 5)) { @@ -169,8 +172,22 @@ static int __init housekeeping_isolcpus_setup(char *str)  			continue;  		} -		pr_warn("isolcpus: Error, unknown flag\n"); -		return 0; +		/* +		 * Skip unknown sub-parameter and validate that it is not +		 * containing an invalid character. +		 */ +		for (par = str, len = 0; *str && *str != ','; str++, len++) { +			if (!isalpha(*str) && *str != '_') +				illegal = true; +		} + +		if (illegal) { +			pr_warn("isolcpus: Invalid flag %.*s\n", len, par); +			return 0; +		} + +		pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par); +		str++;  	}  	/* Default behaviour for isolcpus without flags */ diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index bd006b79b360..b647d04d9c8b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -121,8 +121,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa,  	 */  	if (periods) {  		sa->load_sum = decay_load(sa->load_sum, periods); -		sa->runnable_load_sum = -			decay_load(sa->runnable_load_sum, periods); +		sa->runnable_sum = +			decay_load(sa->runnable_sum, periods);  		sa->util_sum = decay_load((u64)(sa->util_sum), periods);  		/* @@ -149,7 +149,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,  	if (load)  		sa->load_sum += load * contrib;  	if (runnable) -		sa->runnable_load_sum += runnable * contrib; +		sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;  	if (running)  		sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; @@ -238,7 +238,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,  }  static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +___update_load_avg(struct sched_avg *sa, unsigned long load)  {  	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; @@ -246,7 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna  	 * Step 2: update *_avg.  	 
*/  	sa->load_avg = div_u64(load * sa->load_sum, divider); -	sa->runnable_load_avg =	div_u64(runnable * sa->runnable_load_sum, divider); +	sa->runnable_avg = div_u64(sa->runnable_sum, divider);  	WRITE_ONCE(sa->util_avg, sa->util_sum / divider);  } @@ -254,33 +254,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna   * sched_entity:   *   *   task: - *     se_runnable() == se_weight() + *     se_weight()   = se->load.weight + *     se_runnable() = !!on_rq   *   *   group: [ see update_cfs_group() ]   *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg - *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + *     se_runnable() = grq->h_nr_running   * - *   load_sum := runnable_sum - *   load_avg = se_weight(se) * runnable_avg + *   runnable_sum = se_runnable() * runnable = grq->runnable_sum + *   runnable_avg = runnable_sum   * - *   runnable_load_sum := runnable_sum - *   runnable_load_avg = se_runnable(se) * runnable_avg - * - * XXX collapse load_sum and runnable_load_sum + *   load_sum := runnable + *   load_avg = se_weight(se) * load_sum   *   * cfq_rq:   * + *   runnable_sum = \Sum se->avg.runnable_sum + *   runnable_avg = \Sum se->avg.runnable_avg + *   *   load_sum = \Sum se_weight(se) * se->avg.load_sum   *   load_avg = \Sum se->avg.load_avg - * - *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - *   runnable_load_avg = \Sum se->avg.runable_load_avg   */  int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)  {  	if (___update_load_sum(now, &se->avg, 0, 0, 0)) { -		___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); +		___update_load_avg(&se->avg, se_weight(se));  		trace_pelt_se_tp(se);  		return 1;  	} @@ -290,10 +289,10 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)  int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)  { -	if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, +	if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),  				cfs_rq->curr == se)) { -		___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); +		___update_load_avg(&se->avg, se_weight(se));  		cfs_se_util_change(&se->avg);  		trace_pelt_se_tp(se);  		return 1; @@ -306,10 +305,10 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)  {  	if (___update_load_sum(now, &cfs_rq->avg,  				scale_load_down(cfs_rq->load.weight), -				scale_load_down(cfs_rq->runnable_weight), +				cfs_rq->h_nr_running,  				cfs_rq->curr != NULL)) { -		___update_load_avg(&cfs_rq->avg, 1, 1); +		___update_load_avg(&cfs_rq->avg, 1);  		trace_pelt_cfs_tp(cfs_rq);  		return 1;  	} @@ -322,9 +321,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)   *   *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked   *   util_sum = cpu_scale * load_sum - *   runnable_load_sum = load_sum + *   runnable_sum = util_sum   * - *   load_avg and runnable_load_avg are not supported and meaningless. + *   load_avg and runnable_avg are not supported and meaningless.   
*   */ @@ -335,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)  				running,  				running)) { -		___update_load_avg(&rq->avg_rt, 1, 1); +		___update_load_avg(&rq->avg_rt, 1);  		trace_pelt_rt_tp(rq);  		return 1;  	} @@ -348,7 +347,9 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)   *   *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked   *   util_sum = cpu_scale * load_sum - *   runnable_load_sum = load_sum + *   runnable_sum = util_sum + * + *   load_avg and runnable_avg are not supported and meaningless.   *   */ @@ -359,7 +360,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)  				running,  				running)) { -		___update_load_avg(&rq->avg_dl, 1, 1); +		___update_load_avg(&rq->avg_dl, 1);  		trace_pelt_dl_tp(rq);  		return 1;  	} @@ -367,13 +368,46 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)  	return 0;  } +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +/* + * thermal: + * + *   load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked + * + *   util_avg and runnable_avg are not supported and meaningless. + * + * Unlike rt/dl utilization tracking, which tracks time spent by a cpu + * running a rt/dl task through util_avg, the average thermal pressure is + * tracked through load_avg. This is because the thermal pressure signal is a + * time weighted "delta" capacity, unlike util_avg which is binary. + * "delta capacity" =  actual capacity  - + *			capped capacity of a cpu due to a thermal event. + */ + +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ +	if (___update_load_sum(now, &rq->avg_thermal, +			       capacity, +			       capacity, +			       capacity)) { +		___update_load_avg(&rq->avg_thermal, 1); +		trace_pelt_thermal_tp(rq); +		return 1; +	} + +	return 0; +} +#endif +  #ifdef CONFIG_HAVE_SCHED_AVG_IRQ  /*   * irq:   *   *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked   *   util_sum = cpu_scale * load_sum - *   runnable_load_sum = load_sum + *   runnable_sum = util_sum + * + *   load_avg and runnable_avg are not supported and meaningless.
*   */ @@ -410,7 +444,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)  				1);  	if (ret) { -		___update_load_avg(&rq->avg_irq, 1, 1); +		___update_load_avg(&rq->avg_irq, 1);  		trace_pelt_irq_tp(rq);  	} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da065..eb034d9f024d 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);  int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);  int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); + +static inline u64 thermal_load_avg(struct rq *rq) +{ +	return READ_ONCE(rq->avg_thermal.load_avg); +} +#else +static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ +	return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ +	return 0; +} +#endif +  #ifdef CONFIG_HAVE_SCHED_AVG_IRQ  int update_irq_load_avg(struct rq *rq, u64 running);  #else @@ -159,6 +179,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)  }  static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ +	return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ +	return 0; +} + +static inline int  update_irq_load_avg(struct rq *rq, u64 running)  {  	return 0; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 028520702717..8f45cdb6463b 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state)  	case PSI_MEM_FULL:  		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];  	case PSI_CPU_SOME: -		return tasks[NR_RUNNING] > 1; +		return tasks[NR_RUNNING] > tasks[NR_ONCPU];  	case PSI_NONIDLE:  		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||  			tasks[NR_RUNNING]; @@ -669,13 +669,14 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,  		groupc->times[PSI_NONIDLE] += delta;  } -static u32 psi_group_change(struct psi_group *group, int cpu, -			    unsigned int clear, unsigned int set) +static void psi_group_change(struct psi_group *group, int cpu, +			     unsigned int clear, unsigned int set, +			     bool wake_clock)  {  	struct psi_group_cpu *groupc; +	u32 state_mask = 0;  	unsigned int t, m;  	enum psi_states s; -	u32 state_mask = 0;  	groupc = per_cpu_ptr(group->pcpu, cpu); @@ -695,10 +696,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu,  		if (!(m & (1 << t)))  			continue;  		if (groupc->tasks[t] == 0 && !psi_bug) { -			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", +			printk_deferred(KERN_ERR "psi: task underflow! 
cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",  					cpu, t, groupc->tasks[0],  					groupc->tasks[1], groupc->tasks[2], -					clear, set); +					groupc->tasks[3], clear, set);  			psi_bug = 1;  		}  		groupc->tasks[t]--; @@ -717,7 +718,11 @@ static u32 psi_group_change(struct psi_group *group, int cpu,  	write_seqcount_end(&groupc->seq); -	return state_mask; +	if (state_mask & group->poll_states) +		psi_schedule_poll_work(group, 1); + +	if (wake_clock && !delayed_work_pending(&group->avgs_work)) +		schedule_delayed_work(&group->avgs_work, PSI_FREQ);  }  static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -744,27 +749,32 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)  	return &psi_system;  } -void psi_task_change(struct task_struct *task, int clear, int set) +static void psi_flags_change(struct task_struct *task, int clear, int set)  { -	int cpu = task_cpu(task); -	struct psi_group *group; -	bool wake_clock = true; -	void *iter = NULL; - -	if (!task->pid) -		return; -  	if (((task->psi_flags & set) ||  	     (task->psi_flags & clear) != clear) &&  	    !psi_bug) {  		printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", -				task->pid, task->comm, cpu, +				task->pid, task->comm, task_cpu(task),  				task->psi_flags, clear, set);  		psi_bug = 1;  	}  	task->psi_flags &= ~clear;  	task->psi_flags |= set; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ +	int cpu = task_cpu(task); +	struct psi_group *group; +	bool wake_clock = true; +	void *iter = NULL; + +	if (!task->pid) +		return; + +	psi_flags_change(task, clear, set);  	/*  	 * Periodic aggregation shuts off if there is a period of no @@ -777,14 +787,51 @@ void psi_task_change(struct task_struct *task, int clear, int set)  		     wq_worker_last_func(task) == psi_avgs_work))  		wake_clock = false; -	while ((group = iterate_groups(task, &iter))) { -		u32 state_mask = psi_group_change(group, cpu, clear, set); +	while ((group = iterate_groups(task, &iter))) +		psi_group_change(group, cpu, clear, set, wake_clock); +} + +void psi_task_switch(struct task_struct *prev, struct task_struct *next, +		     bool sleep) +{ +	struct psi_group *group, *common = NULL; +	int cpu = task_cpu(prev); +	void *iter; + +	if (next->pid) { +		psi_flags_change(next, 0, TSK_ONCPU); +		/* +		 * When moving state between tasks, the group that +		 * contains them both does not change: we can stop +		 * updating the tree once we reach the first common +		 * ancestor. Iterate @next's ancestors until we +		 * encounter @prev's state. +		 */ +		iter = NULL; +		while ((group = iterate_groups(next, &iter))) { +			if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { +				common = group; +				break; +			} + +			psi_group_change(group, cpu, 0, TSK_ONCPU, true); +		} +	} + +	/* +	 * If this is a voluntary sleep, dequeue will have taken care +	 * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We +	 * only need to deal with it during preemption. 
+	 */ +	if (sleep) +		return; -		if (state_mask & group->poll_states) -			psi_schedule_poll_work(group, 1); +	if (prev->pid) { +		psi_flags_change(prev, TSK_ONCPU, 0); -		if (wake_clock && !delayed_work_pending(&group->avgs_work)) -			schedule_delayed_work(&group->avgs_work, PSI_FREQ); +		iter = NULL; +		while ((group = iterate_groups(prev, &iter)) && group != common) +			psi_group_change(group, cpu, TSK_ONCPU, 0, true);  	}  } @@ -818,17 +865,17 @@ void psi_memstall_enter(unsigned long *flags)  	if (static_branch_likely(&psi_disabled))  		return; -	*flags = current->flags & PF_MEMSTALL; +	*flags = current->in_memstall;  	if (*flags)  		return;  	/* -	 * PF_MEMSTALL setting & accounting needs to be atomic wrt +	 * in_memstall setting & accounting needs to be atomic wrt  	 * changes to the task's scheduling state, otherwise we can  	 * race with CPU migration.  	 */  	rq = this_rq_lock_irq(&rf); -	current->flags |= PF_MEMSTALL; +	current->in_memstall = 1;  	psi_task_change(current, 0, TSK_MEMSTALL);  	rq_unlock_irq(rq, &rf); @@ -851,13 +898,13 @@ void psi_memstall_leave(unsigned long *flags)  	if (*flags)  		return;  	/* -	 * PF_MEMSTALL clearing & accounting needs to be atomic wrt +	 * in_memstall clearing & accounting needs to be atomic wrt  	 * changes to the task's scheduling state, otherwise we could  	 * race with CPU migration.  	 */  	rq = this_rq_lock_irq(&rf); -	current->flags &= ~PF_MEMSTALL; +	current->in_memstall = 0;  	psi_task_change(current, TSK_MEMSTALL, 0);  	rq_unlock_irq(rq, &rf); @@ -916,12 +963,14 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)  	rq = task_rq_lock(task, &rf); -	if (task_on_rq_queued(task)) +	if (task_on_rq_queued(task)) {  		task_flags = TSK_RUNNING; -	else if (task->in_iowait) +		if (task_current(rq, task)) +			task_flags |= TSK_ONCPU; +	} else if (task->in_iowait)  		task_flags = TSK_IOWAIT; -	if (task->flags & PF_MEMSTALL) +	if (task->in_memstall)  		task_flags |= TSK_MEMSTALL;  	if (task_flags) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4043abe45459..df11d88c9895 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1475,6 +1475,13 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)  		int target = find_lowest_rq(p);  		/* +		 * Bail out if we were forcing a migration to find a better +		 * fitting CPU but our search failed. +		 */ +		if (!test && target != -1 && !rt_task_fits_capacity(p, target)) +			goto out_unlock; + +		/*  		 * Don't bother moving it if the destination CPU is  		 * not running a lower priority task.  		 */ @@ -1482,6 +1489,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)  		    p->prio < cpu_rq(target)->rt.highest_prio.curr)  			cpu = target;  	} + +out_unlock:  	rcu_read_unlock();  out: @@ -1495,7 +1504,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)  	 * let's hope p can move out.  	 */  	if (rq->curr->nr_cpus_allowed == 1 || -	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) +	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))  		return;  	/* @@ -1503,7 +1512,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)  	 * see if it is pushed or pulled somewhere else.  	 
*/  	if (p->nr_cpus_allowed != 1 && -	    cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) +	    cpupri_find(&rq->rd->cpupri, p, NULL))  		return;  	/* @@ -1647,8 +1656,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)  static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)  {  	if (!task_running(rq, p) && -	    cpumask_test_cpu(cpu, p->cpus_ptr) && -	    rt_task_fits_capacity(p, cpu)) +	    cpumask_test_cpu(cpu, p->cpus_ptr))  		return 1;  	return 0; @@ -1682,6 +1690,7 @@ static int find_lowest_rq(struct task_struct *task)  	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);  	int this_cpu = smp_processor_id();  	int cpu      = task_cpu(task); +	int ret;  	/* Make sure the mask is initialized first */  	if (unlikely(!lowest_mask)) @@ -1690,8 +1699,22 @@ static int find_lowest_rq(struct task_struct *task)  	if (task->nr_cpus_allowed == 1)  		return -1; /* No other targets possible */ -	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, -			 rt_task_fits_capacity)) +	/* +	 * If we're on asym system ensure we consider the different capacities +	 * of the CPUs when searching for the lowest_mask. +	 */ +	if (static_branch_unlikely(&sched_asym_cpucapacity)) { + +		ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, +					  task, lowest_mask, +					  rt_task_fits_capacity); +	} else { + +		ret = cpupri_find(&task_rq(task)->rd->cpupri, +				  task, lowest_mask); +	} + +	if (!ret)  		return -1; /* No targets found */  	/* @@ -2202,7 +2225,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)  			    (rq->curr->nr_cpus_allowed < 2 ||  			     rq->curr->prio <= p->prio); -	if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) +	if (need_to_push)  		push_rt_tasks(rq);  } @@ -2274,10 +2297,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)  	 */  	if (task_on_rq_queued(p) && rq->curr != p) {  #ifdef CONFIG_SMP -		bool need_to_push = rq->rt.overloaded || -				    !rt_task_fits_capacity(p, cpu_of(rq)); - -		if (p->nr_cpus_allowed > 1 && need_to_push) +		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)  			rt_queue_push_tasks(rq);  #endif /* CONFIG_SMP */  		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) @@ -2449,10 +2469,11 @@ const struct sched_class rt_sched_class = {   */  static DEFINE_MUTEX(rt_constraints_mutex); -/* Must be called with tasklist_lock held */  static inline int tg_has_rt_tasks(struct task_group *tg)  { -	struct task_struct *g, *p; +	struct task_struct *task; +	struct css_task_iter it; +	int ret = 0;  	/*  	 * Autogroups do not have RT tasks; see autogroup_create(). @@ -2460,12 +2481,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)  	if (task_group_is_autogroup(tg))  		return 0; -	for_each_process_thread(g, p) { -		if (rt_task(p) && task_group(p) == tg) -			return 1; -	} +	css_task_iter_start(&tg->css, 0, &it); +	while (!ret && (task = css_task_iter_next(&it))) +		ret |= rt_task(task); +	css_task_iter_end(&it); -	return 0; +	return ret;  }  struct rt_schedulable_data { @@ -2496,9 +2517,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)  		return -EINVAL;  	/* -	 * Ensure we don't starve existing RT tasks. +	 * Ensure we don't starve existing RT tasks if runtime turns zero.  	 
*/ -	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) +	if (rt_bandwidth_enabled() && !runtime && +	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))  		return -EBUSY;  	total = to_ratio(period, runtime); @@ -2564,7 +2586,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,  		return -EINVAL;  	mutex_lock(&rt_constraints_mutex); -	read_lock(&tasklist_lock);  	err = __rt_schedulable(tg, rt_period, rt_runtime);  	if (err)  		goto unlock; @@ -2582,7 +2603,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,  	}  	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);  unlock: -	read_unlock(&tasklist_lock);  	mutex_unlock(&rt_constraints_mutex);  	return err; @@ -2641,9 +2661,7 @@ static int sched_rt_global_constraints(void)  	int ret = 0;  	mutex_lock(&rt_constraints_mutex); -	read_lock(&tasklist_lock);  	ret = __rt_schedulable(NULL, 0, 0); -	read_unlock(&tasklist_lock);  	mutex_unlock(&rt_constraints_mutex);  	return ret; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9ea647835fd6..db3a57675ccf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -118,7 +118,13 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);  #ifdef CONFIG_64BIT  # define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)  # define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) \ +({ \ +	unsigned long __w = (w); \ +	if (__w) \ +		__w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ +	__w; \ +})  #else  # define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT)  # define scale_load(w)		(w) @@ -189,6 +195,12 @@ static inline int task_has_dl_policy(struct task_struct *p)  #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +static inline void update_avg(u64 *avg, u64 sample) +{ +	s64 diff = sample - *avg; +	*avg += diff / 8; +} +  /*   * !! For sched_setattr_nocheck() (kernel) only !!   
* @@ -305,7 +317,6 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)  	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;  } -extern void dl_change_utilization(struct task_struct *p, u64 new_bw);  extern void init_dl_bw(struct dl_bw *dl_b);  extern int  sched_dl_global_validate(void);  extern void sched_dl_do_global(void); @@ -489,7 +500,6 @@ struct cfs_bandwidth { };  /* CFS-related fields in a runqueue */  struct cfs_rq {  	struct load_weight	load; -	unsigned long		runnable_weight;  	unsigned int		nr_running;  	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */  	unsigned int		idle_h_nr_running; /* SCHED_IDLE */ @@ -528,7 +538,7 @@ struct cfs_rq {  		int		nr;  		unsigned long	load_avg;  		unsigned long	util_avg; -		unsigned long	runnable_sum; +		unsigned long	runnable_avg;  	} removed;  #ifdef CONFIG_FAIR_GROUP_SCHED @@ -688,8 +698,30 @@ struct dl_rq {  #ifdef CONFIG_FAIR_GROUP_SCHED  /* An entity is a task if it doesn't "own" a runqueue */  #define entity_is_task(se)	(!se->my_q) + +static inline void se_update_runnable(struct sched_entity *se) +{ +	if (!entity_is_task(se)) +		se->runnable_weight = se->my_q->h_nr_running; +} + +static inline long se_runnable(struct sched_entity *se) +{ +	if (entity_is_task(se)) +		return !!se->on_rq; +	else +		return se->runnable_weight; +} +  #else  #define entity_is_task(se)	1 + +static inline void se_update_runnable(struct sched_entity *se) {} + +static inline long se_runnable(struct sched_entity *se) +{ +	return !!se->on_rq; +}  #endif  #ifdef CONFIG_SMP @@ -701,10 +733,6 @@ static inline long se_weight(struct sched_entity *se)  	return scale_load_down(se->load.weight);  } -static inline long se_runnable(struct sched_entity *se) -{ -	return scale_load_down(se->runnable_weight); -}  static inline bool sched_asym_prefer(int a, int b)  { @@ -860,7 +888,6 @@ struct rq {  #endif  #ifdef CONFIG_NO_HZ_COMMON  #ifdef CONFIG_SMP -	unsigned long		last_load_update_tick;  	unsigned long		last_blocked_load_update_tick;  	unsigned int		has_blocked_load;  #endif /* CONFIG_SMP */ @@ -944,6 +971,9 @@ struct rq {  #ifdef CONFIG_HAVE_SCHED_AVG_IRQ  	struct sched_avg	avg_irq;  #endif +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +	struct sched_avg	avg_thermal; +#endif  	u64			idle_stamp;  	u64			avg_idle; @@ -967,7 +997,6 @@ struct rq {  #ifdef CONFIG_SCHED_HRTICK  #ifdef CONFIG_SMP -	int			hrtick_csd_pending;  	call_single_data_t	hrtick_csd;  #endif  	struct hrtimer		hrtick_timer; @@ -1107,6 +1136,24 @@ static inline u64 rq_clock_task(struct rq *rq)  	return rq->clock_task;  } +/** + * By default the decay is the default pelt decay period. + * The decay shift can change the decay period in + * multiples of 32. + *  Decay shift		Decay period(ms) + *	0			32 + *	1			64 + *	2			128 + *	3			256 + *	4			512 + */ +extern int sched_thermal_decay_shift; + +static inline u64 rq_clock_thermal(struct rq *rq) +{ +	return rq_clock_task(rq) >> sched_thermal_decay_shift; +} +  static inline void rq_clock_skip_update(struct rq *rq)  {  	lockdep_assert_held(&rq->lock); @@ -1337,8 +1384,6 @@ extern void sched_ttwu_pending(void);  	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \  			__sd; __sd = __sd->parent) -#define for_each_lower_domain(sd) for (; sd; sd = sd->child) -  /**   * highest_flag_domain - Return highest sched_domain containing flag.   
* @cpu:	The CPU whose highest level of sched domain is to @@ -1869,7 +1914,6 @@ extern struct dl_bandwidth def_dl_bandwidth;  extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);  extern void init_dl_task_timer(struct sched_dl_entity *dl_se);  extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);  #define BW_SHIFT		20  #define BW_UNIT			(1 << BW_SHIFT) @@ -1968,6 +2012,13 @@ static inline int hrtick_enabled(struct rq *rq)  #endif /* CONFIG_SCHED_HRTICK */ +#ifndef arch_scale_freq_tick +static __always_inline +void arch_scale_freq_tick(void) +{ +} +#endif +  #ifndef arch_scale_freq_capacity  static __always_inline  unsigned long arch_scale_freq_capacity(int cpu) @@ -2492,3 +2543,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)  	return true;  }  #endif + +void swake_up_all_locked(struct swait_queue_head *q); +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index ba683fe81a6e..33d0daf83842 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -70,7 +70,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)  		return;  	if (!wakeup || p->sched_psi_wake_requeue) { -		if (p->flags & PF_MEMSTALL) +		if (p->in_memstall)  			set |= TSK_MEMSTALL;  		if (p->sched_psi_wake_requeue)  			p->sched_psi_wake_requeue = 0; @@ -90,9 +90,17 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)  		return;  	if (!sleep) { -		if (p->flags & PF_MEMSTALL) +		if (p->in_memstall)  			clear |= TSK_MEMSTALL;  	} else { +		/* +		 * When a task sleeps, schedule() dequeues it before +		 * switching to the next one. Merge the clearing of +		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary +		 * psi_task_change() call in psi_sched_switch(). +		 */ +		clear |= TSK_ONCPU; +  		if (p->in_iowait)  			set |= TSK_IOWAIT;  	} @@ -109,14 +117,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)  	 * deregister its sleep-persistent psi states from the old  	 * queue, and let psi_enqueue() know it has to requeue.  	 
*/ -	if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { +	if (unlikely(p->in_iowait || p->in_memstall)) {  		struct rq_flags rf;  		struct rq *rq;  		int clear = 0;  		if (p->in_iowait)  			clear |= TSK_IOWAIT; -		if (p->flags & PF_MEMSTALL) +		if (p->in_memstall)  			clear |= TSK_MEMSTALL;  		rq = __task_rq_lock(p, &rf); @@ -126,18 +134,31 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)  	}  } +static inline void psi_sched_switch(struct task_struct *prev, +				    struct task_struct *next, +				    bool sleep) +{ +	if (static_branch_likely(&psi_disabled)) +		return; + +	psi_task_switch(prev, next, sleep); +} +  static inline void psi_task_tick(struct rq *rq)  {  	if (static_branch_likely(&psi_disabled))  		return; -	if (unlikely(rq->curr->flags & PF_MEMSTALL)) +	if (unlikely(rq->curr->in_memstall))  		psi_memstall_tick(rq->curr, cpu_of(rq));  }  #else /* CONFIG_PSI */  static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}  static inline void psi_dequeue(struct task_struct *p, bool sleep) {}  static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_sched_switch(struct task_struct *prev, +				    struct task_struct *next, +				    bool sleep) {}  static inline void psi_task_tick(struct rq *rq) {}  #endif /* CONFIG_PSI */ diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e83a3f8449f6..e1c655f928c7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,6 +32,19 @@ void swake_up_locked(struct swait_queue_head *q)  }  EXPORT_SYMBOL(swake_up_locked); +/* + * Wake up all waiters. This is an interface which is solely exposed for + * completions and not for general usage. + * + * It is intentionally different from swake_up_all() to allow usage from + * hard interrupt context and interrupt disabled regions. + */ +void swake_up_all_locked(struct swait_queue_head *q) +{ +	while (!list_empty(&q->task_list)) +		swake_up_locked(q); +} +  void swake_up_one(struct swait_queue_head *q)  {  	unsigned long flags; @@ -69,7 +82,7 @@ void swake_up_all(struct swait_queue_head *q)  }  EXPORT_SYMBOL(swake_up_all); -static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)  {  	wait->task = current;  	if (list_empty(&wait->task_list)) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dfb64c08a407..8344757bba6e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -317,8 +317,9 @@ static void sched_energy_set(bool has_eas)   * EAS can be used on a root domain if it meets all the following conditions:   *    1. an Energy Model (EM) is available;   *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. - *    3. the EM complexity is low enough to keep scheduling overheads low; - *    4. schedutil is driving the frequency of all CPUs of the rd; + *    3. no SMT is detected. + *    4. the EM complexity is low enough to keep scheduling overheads low; + *    5. schedutil is driving the frequency of all CPUs of the rd;   *   * The complexity of the Energy Model is defined as:   * @@ -360,6 +361,13 @@ static bool build_perf_domains(const struct cpumask *cpu_map)  		goto free;  	} +	/* EAS definitely does *not* handle SMT */ +	if (sched_smt_active()) { +		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", +			cpumask_pr_args(cpu_map)); +		goto free; +	} +  	for_each_cpu(i, cpu_map) {  		/* Skip already covered CPUs. 
*/  		if (find_pd(pd, i)) @@ -1374,18 +1382,9 @@ sd_init(struct sched_domain_topology_level *tl,  	 * Convert topological properties into behaviour.  	 */ -	if (sd->flags & SD_ASYM_CPUCAPACITY) { -		struct sched_domain *t = sd; - -		/* -		 * Don't attempt to spread across CPUs of different capacities. -		 */ -		if (sd->child) -			sd->child->flags &= ~SD_PREFER_SIBLING; - -		for_each_lower_domain(t) -			t->flags |= SD_BALANCE_WAKE; -	} +	/* Don't attempt to spread across CPUs of different capacities. */ +	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) +		sd->child->flags &= ~SD_PREFER_SIBLING;  	if (sd->flags & SD_SHARE_CPUCAPACITY) {  		sd->imbalance_pct = 110; |
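
A note on the update_avg() helper added to kernel/sched/sched.h above: it is a plain exponential moving average in which each sample contributes 1/8 of its deviation from the current average, and the fair.c hunk now uses it for this_sd->avg_scan_cost in select_idle_cpu() in place of the previously open-coded (time - cost) / 8 update. Below is a minimal stand-alone sketch of the same arithmetic, assuming ordinary user-space C and made-up scan-time samples; it is an illustration, not kernel code.

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as the kernel's update_avg(): avg += (sample - avg) / 8. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);	/* signed delta, an s64 in the kernel */

	*avg += (uint64_t)(diff / 8);
}

int main(void)
{
	/* Hypothetical idle-scan costs in nanoseconds, for illustration only. */
	uint64_t samples[] = { 800, 1200, 1000, 400, 4000 };
	uint64_t avg_scan_cost = 0;
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_scan_cost, samples[i]);
		printf("sample=%4llu  avg_scan_cost=%4llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_scan_cost);
	}
	return 0;
}

Because each sample is weighted by 1/8, a single outlier (the 4000 ns sample above) shifts the average only modestly, which is the smoothing behaviour the scan-cost heuristic relies on.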