Diffstat (limited to 'kernel/sched/fair.c')
 -rw-r--r--  kernel/sched/fair.c | 341
 1 file changed, 268 insertions, 73 deletions
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b39fb596f6c1..ee271bb661cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);  static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu);  /* Give new sched_entity start runnable values to heavy its load in infant time */  void init_entity_runnable_average(struct sched_entity *se) @@ -1392,6 +1393,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,  	int last_cpupid, this_cpupid;  	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); +	last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + +	/* +	 * Allow first faults or private faults to migrate immediately early in +	 * the lifetime of a task. The magic number 4 is based on waiting for +	 * two full passes of the "multi-stage node selection" test that is +	 * executed below. +	 */ +	if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && +	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) +		return true;  	/*  	 * Multi-stage node selection is used in conjunction with a periodic @@ -1410,7 +1422,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,  	 * This quadric squishes small probabilities, making it less likely we  	 * act on an unlikely task<->page relation.  	 */ -	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);  	if (!cpupid_pid_unset(last_cpupid) &&  				cpupid_to_nid(last_cpupid) != dst_nid)  		return false; @@ -1446,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,  static unsigned long weighted_cpuload(struct rq *rq);  static unsigned long source_load(int cpu, int type);  static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu);  /* Cached statistics for all CPUs within a node */  struct numa_stats { @@ -1454,8 +1464,6 @@ struct numa_stats {  	/* Total compute capacity of CPUs on a node */  	unsigned long compute_capacity; - -	unsigned int nr_running;  };  /* @@ -1463,36 +1471,16 @@ struct numa_stats {   */  static void update_numa_stats(struct numa_stats *ns, int nid)  { -	int smt, cpu, cpus = 0; -	unsigned long capacity; +	int cpu;  	memset(ns, 0, sizeof(*ns));  	for_each_cpu(cpu, cpumask_of_node(nid)) {  		struct rq *rq = cpu_rq(cpu); -		ns->nr_running += rq->nr_running;  		ns->load += weighted_cpuload(rq);  		ns->compute_capacity += capacity_of(cpu); - -		cpus++;  	} -	/* -	 * If we raced with hotplug and there are no CPUs left in our mask -	 * the @ns structure is NULL'ed and task_numa_compare() will -	 * not find this node attractive. -	 * -	 * We'll detect a huge imbalance and bail there. -	 */ -	if (!cpus) -		return; - -	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ -	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); -	capacity = cpus / smt; /* cores */ - -	capacity = min_t(unsigned, capacity, -		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));  }  struct task_numa_env { @@ -1514,6 +1502,21 @@ struct task_numa_env {  static void task_numa_assign(struct task_numa_env *env,  			     struct task_struct *p, long imp)  { +	struct rq *rq = cpu_rq(env->dst_cpu); + +	/* Bail out if run-queue part of active NUMA balance. 
*/ +	if (xchg(&rq->numa_migrate_on, 1)) +		return; + +	/* +	 * Clear previous best_cpu/rq numa-migrate flag, since task now +	 * found a better CPU to move/swap. +	 */ +	if (env->best_cpu != -1) { +		rq = cpu_rq(env->best_cpu); +		WRITE_ONCE(rq->numa_migrate_on, 0); +	} +  	if (env->best_task)  		put_task_struct(env->best_task);  	if (p) @@ -1553,6 +1556,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,  }  /* + * Maximum NUMA importance can be 1998 (2*999); + * SMALLIMP @ 30 would be close to 1998/64. + * Used to deter task migration. + */ +#define SMALLIMP	30 + +/*   * This checks if the overall compute and NUMA accesses of the system would   * be improved if the source tasks was migrated to the target dst_cpu taking   * into account that it might be best if task running on the dst_cpu should @@ -1569,6 +1579,9 @@ static void task_numa_compare(struct task_numa_env *env,  	long moveimp = imp;  	int dist = env->dist; +	if (READ_ONCE(dst_rq->numa_migrate_on)) +		return; +  	rcu_read_lock();  	cur = task_rcu_dereference(&dst_rq->curr);  	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) @@ -1582,7 +1595,7 @@ static void task_numa_compare(struct task_numa_env *env,  		goto unlock;  	if (!cur) { -		if (maymove || imp > env->best_imp) +		if (maymove && moveimp >= env->best_imp)  			goto assign;  		else  			goto unlock; @@ -1625,16 +1638,22 @@ static void task_numa_compare(struct task_numa_env *env,  			       task_weight(cur, env->dst_nid, dist);  	} -	if (imp <= env->best_imp) -		goto unlock; -  	if (maymove && moveimp > imp && moveimp > env->best_imp) { -		imp = moveimp - 1; +		imp = moveimp;  		cur = NULL;  		goto assign;  	}  	/* +	 * If the NUMA importance is less than SMALLIMP, +	 * task migration might only result in ping pong +	 * of tasks and also hurt performance due to cache +	 * misses. +	 */ +	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) +		goto unlock; + +	/*  	 * In the overloaded case, try and keep the load balanced.  	 */  	load = task_h_load(env->p) - task_h_load(cur); @@ -1710,6 +1729,7 @@ static int task_numa_migrate(struct task_struct *p)  		.best_cpu = -1,  	};  	struct sched_domain *sd; +	struct rq *best_rq;  	unsigned long taskweight, groupweight;  	int nid, ret, dist;  	long taskimp, groupimp; @@ -1805,20 +1825,17 @@ static int task_numa_migrate(struct task_struct *p)  	if (env.best_cpu == -1)  		return -EAGAIN; -	/* -	 * Reset the scan period if the task is being rescheduled on an -	 * alternative node to recheck if the tasks is now properly placed. 
-	 */ -	p->numa_scan_period = task_scan_start(p); - +	best_rq = cpu_rq(env.best_cpu);  	if (env.best_task == NULL) {  		ret = migrate_task_to(p, env.best_cpu); +		WRITE_ONCE(best_rq->numa_migrate_on, 0);  		if (ret != 0)  			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);  		return ret;  	}  	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); +	WRITE_ONCE(best_rq->numa_migrate_on, 0);  	if (ret != 0)  		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); @@ -2596,6 +2613,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)  	}  } +static void update_scan_period(struct task_struct *p, int new_cpu) +{ +	int src_nid = cpu_to_node(task_cpu(p)); +	int dst_nid = cpu_to_node(new_cpu); + +	if (!static_branch_likely(&sched_numa_balancing)) +		return; + +	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) +		return; + +	if (src_nid == dst_nid) +		return; + +	/* +	 * Allow resets if faults have been trapped before one scan +	 * has completed. This is most likely due to a new task that +	 * is pulled cross-node due to wakeups or load balancing. +	 */ +	if (p->numa_scan_seq) { +		/* +		 * Avoid scan adjustments if moving to the preferred +		 * node or if the task was not previously running on +		 * the preferred node. +		 */ +		if (dst_nid == p->numa_preferred_nid || +		    (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) +			return; +	} + +	p->numa_scan_period = task_scan_start(p); +} +  #else  static void task_tick_numa(struct rq *rq, struct task_struct *curr)  { @@ -2609,6 +2659,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)  {  } +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} +  #endif /* CONFIG_NUMA_BALANCING */  static void @@ -3362,6 +3416,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)   * attach_entity_load_avg - attach this entity to its cfs_rq load avg   * @cfs_rq: cfs_rq to attach to   * @se: sched_entity to attach + * @flags: migration hints   *   * Must call update_cfs_rq_load_avg() before this, since we rely on   * cfs_rq->avg.last_update_time being current. @@ -3646,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)  	WRITE_ONCE(p->se.avg.util_est, ue);  } +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ +	return capacity * 1024 > task_util_est(p) * capacity_margin; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ +	if (!static_branch_unlikely(&sched_asym_cpucapacity)) +		return; + +	if (!p) { +		rq->misfit_task_load = 0; +		return; +	} + +	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { +		rq->misfit_task_load = 0; +		return; +	} + +	rq->misfit_task_load = task_h_load(p); +} +  #else /* CONFIG_SMP */  #define UPDATE_TG	0x0 @@ -3675,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}  static inline void  util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,  		 bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}  #endif /* CONFIG_SMP */ @@ -3924,7 +4003,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	 * put back on, and if we advance min_vruntime, we'll be placed back  	 * further than we started -- ie. we'll be penalized.  	 
*/ -	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) +	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)  		update_min_vruntime(cfs_rq);  } @@ -4399,9 +4478,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	/*  	 * Add to the _head_ of the list, so that an already-started -	 * distribute_cfs_runtime will not see us +	 * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is +	 * not running add to the tail so that later runqueues don't get starved.  	 */ -	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); +	if (cfs_b->distribute_running) +		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); +	else +		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);  	/*  	 * If we're the first throttled task, make sure the bandwidth @@ -4545,14 +4628,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)  	 * in us over-using our runtime if it is all used during this loop, but  	 * only by limited amounts in that extreme case.  	 */ -	while (throttled && cfs_b->runtime > 0) { +	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {  		runtime = cfs_b->runtime; +		cfs_b->distribute_running = 1;  		raw_spin_unlock(&cfs_b->lock);  		/* we can't nest cfs_b->lock while distributing bandwidth */  		runtime = distribute_cfs_runtime(cfs_b, runtime,  						 runtime_expires);  		raw_spin_lock(&cfs_b->lock); +		cfs_b->distribute_running = 0;  		throttled = !list_empty(&cfs_b->throttled_cfs_rq);  		cfs_b->runtime -= min(runtime, cfs_b->runtime); @@ -4663,6 +4748,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	/* confirm we're still not at a refresh boundary */  	raw_spin_lock(&cfs_b->lock); +	if (cfs_b->distribute_running) { +		raw_spin_unlock(&cfs_b->lock); +		return; +	} +  	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {  		raw_spin_unlock(&cfs_b->lock);  		return; @@ -4672,6 +4762,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  		runtime = cfs_b->runtime;  	expires = cfs_b->runtime_expires; +	if (runtime) +		cfs_b->distribute_running = 1; +  	raw_spin_unlock(&cfs_b->lock);  	if (!runtime) @@ -4682,6 +4775,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	raw_spin_lock(&cfs_b->lock);  	if (expires == cfs_b->runtime_expires)  		cfs_b->runtime -= min(runtime, cfs_b->runtime); +	cfs_b->distribute_running = 0;  	raw_spin_unlock(&cfs_b->lock);  } @@ -4790,6 +4884,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  	cfs_b->period_timer.function = sched_cfs_period_timer;  	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);  	cfs_b->slack_timer.function = sched_cfs_slack_timer; +	cfs_b->distribute_running = 0;  }  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -6187,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)  {  	long min_cap, max_cap; +	if (!static_branch_unlikely(&sched_asym_cpucapacity)) +		return 0; +  	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));  	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; @@ -6197,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)  	/* Bring task utilization in sync with prev_cpu */  	sync_entity_load_avg(&p->se); -	return min_cap * 1024 < task_util(p) * capacity_margin; +	return !task_fits_capacity(p, min_cap);  }  /* @@ -6274,7 +6372,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se);   * cfs_rq_of(p) references at time of call are still 
valid and identify the   * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.   */ -static void migrate_task_rq_fair(struct task_struct *p) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)  {  	/*  	 * As blocked tasks retain absolute vruntime the migration needs to @@ -6327,6 +6425,8 @@ static void migrate_task_rq_fair(struct task_struct *p)  	/* We have migrated, no longer consider this task hot */  	p->se.exec_start = 0; + +	update_scan_period(p, new_cpu);  }  static void task_dead_fair(struct task_struct *p) @@ -6614,9 +6714,12 @@ done: __maybe_unused;  	if (hrtick_enabled(rq))  		hrtick_start_fair(rq, p); +	update_misfit_status(p, rq); +  	return p;  idle: +	update_misfit_status(NULL, rq);  	new_tasks = idle_balance(rq, rf);  	/* @@ -6822,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;  enum fbq_type { regular, remote, all }; +enum group_type { +	group_other = 0, +	group_misfit_task, +	group_imbalanced, +	group_overloaded, +}; +  #define LBF_ALL_PINNED	0x01  #define LBF_NEED_BREAK	0x02  #define LBF_DST_PINNED  0x04 @@ -6852,6 +6962,7 @@ struct lb_env {  	unsigned int		loop_max;  	enum fbq_type		fbq_type; +	enum group_type		src_grp_type;  	struct list_head	tasks;  }; @@ -7232,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq)  	if (READ_ONCE(rq->avg_dl.util_avg))  		return true; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ  	if (READ_ONCE(rq->avg_irq.util_avg))  		return true;  #endif @@ -7263,6 +7374,7 @@ static void update_blocked_averages(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	struct cfs_rq *cfs_rq, *pos; +	const struct sched_class *curr_class;  	struct rq_flags rf;  	bool done = true; @@ -7299,8 +7411,10 @@ static void update_blocked_averages(int cpu)  		if (cfs_rq_has_blocked(cfs_rq))  			done = false;  	} -	update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); -	update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + +	curr_class = rq->curr->sched_class; +	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); +	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);  	update_irq_load_avg(rq, 0);  	/* Don't need periodic decay once load/util_avg are null */  	if (others_have_blocked(rq)) @@ -7365,13 +7479,16 @@ static inline void update_blocked_averages(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	struct cfs_rq *cfs_rq = &rq->cfs; +	const struct sched_class *curr_class;  	struct rq_flags rf;  	rq_lock_irqsave(rq, &rf);  	update_rq_clock(rq);  	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); -	update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); -	update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + +	curr_class = rq->curr->sched_class; +	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); +	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);  	update_irq_load_avg(rq, 0);  #ifdef CONFIG_NO_HZ_COMMON  	rq->last_blocked_load_update_tick = jiffies; @@ -7389,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p)  /********** Helpers for find_busiest_group ************************/ -enum group_type { -	group_other = 0, -	group_imbalanced, -	group_overloaded, -}; -  /*   * sg_lb_stats - stats of a sched_group required for load_balancing   */ @@ -7410,6 +7521,7 @@ struct sg_lb_stats {  	unsigned int group_weight;  	enum group_type group_type;  	int group_no_capacity; +	unsigned long group_misfit_task_load; /* A 
CPU has a task too big for its capacity */  #ifdef CONFIG_NUMA_BALANCING  	unsigned int nr_numa_running;  	unsigned int nr_preferred_running; @@ -7482,10 +7594,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd,  	return load_idx;  } -static unsigned long scale_rt_capacity(int cpu) +static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)  {  	struct rq *rq = cpu_rq(cpu); -	unsigned long max = arch_scale_cpu_capacity(NULL, cpu); +	unsigned long max = arch_scale_cpu_capacity(sd, cpu);  	unsigned long used, free;  	unsigned long irq; @@ -7507,7 +7619,7 @@ static unsigned long scale_rt_capacity(int cpu)  static void update_cpu_capacity(struct sched_domain *sd, int cpu)  { -	unsigned long capacity = scale_rt_capacity(cpu); +	unsigned long capacity = scale_rt_capacity(sd, cpu);  	struct sched_group *sdg = sd->groups;  	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); @@ -7518,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)  	cpu_rq(cpu)->cpu_capacity = capacity;  	sdg->sgc->capacity = capacity;  	sdg->sgc->min_capacity = capacity; +	sdg->sgc->max_capacity = capacity;  }  void update_group_capacity(struct sched_domain *sd, int cpu)  {  	struct sched_domain *child = sd->child;  	struct sched_group *group, *sdg = sd->groups; -	unsigned long capacity, min_capacity; +	unsigned long capacity, min_capacity, max_capacity;  	unsigned long interval;  	interval = msecs_to_jiffies(sd->balance_interval); @@ -7538,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)  	capacity = 0;  	min_capacity = ULONG_MAX; +	max_capacity = 0;  	if (child->flags & SD_OVERLAP) {  		/* @@ -7568,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)  			}  			min_capacity = min(capacity, min_capacity); +			max_capacity = max(capacity, max_capacity);  		}  	} else  {  		/* @@ -7581,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)  			capacity += sgc->capacity;  			min_capacity = min(sgc->min_capacity, min_capacity); +			max_capacity = max(sgc->max_capacity, max_capacity);  			group = group->next;  		} while (group != child->groups);  	}  	sdg->sgc->capacity = capacity;  	sdg->sgc->min_capacity = min_capacity; +	sdg->sgc->max_capacity = max_capacity;  }  /* @@ -7682,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)  }  /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller   * per-CPU capacity than sched_group ref.   */  static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)  {  	return sg->sgc->min_capacity * capacity_margin <  						ref->sgc->min_capacity * 1024;  } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. 
+ */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ +	return sg->sgc->max_capacity * capacity_margin < +						ref->sgc->max_capacity * 1024; +} +  static inline enum  group_type group_classify(struct sched_group *group,  			  struct sg_lb_stats *sgs) @@ -7702,6 +7830,9 @@ group_type group_classify(struct sched_group *group,  	if (sg_imbalanced(group))  		return group_imbalanced; +	if (sgs->group_misfit_task_load) +		return group_misfit_task; +  	return group_other;  } @@ -7734,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)   * @load_idx: Load index of sched_domain of this_cpu for load calc.   * @local_group: Does group contain this_cpu.   * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task).   */  static inline void update_sg_lb_stats(struct lb_env *env,  			struct sched_group *group, int load_idx, @@ -7776,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,  		 */  		if (!nr_running && idle_cpu(i))  			sgs->idle_cpus++; + +		if (env->sd->flags & SD_ASYM_CPUCAPACITY && +		    sgs->group_misfit_task_load < rq->misfit_task_load) { +			sgs->group_misfit_task_load = rq->misfit_task_load; +			*overload = 1; +		}  	}  	/* Adjust by relative CPU capacity of the group */ @@ -7811,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,  {  	struct sg_lb_stats *busiest = &sds->busiest_stat; +	/* +	 * Don't try to pull misfit tasks we can't help. +	 * We can use max_capacity here as reduction in capacity on some +	 * CPUs in the group should either be possible to resolve +	 * internally or be covered by avg_load imbalance (eventually). +	 */ +	if (sgs->group_type == group_misfit_task && +	    (!group_smaller_max_cpu_capacity(sg, sds->local) || +	     !group_has_capacity(env, &sds->local_stat))) +		return false; +  	if (sgs->group_type > busiest->group_type)  		return true; @@ -7830,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,  	 * power/energy consequences are not considered.  	 */  	if (sgs->sum_nr_running <= sgs->group_weight && -	    group_smaller_cpu_capacity(sds->local, sg)) +	    group_smaller_min_cpu_capacity(sds->local, sg)) +		return false; + +	/* +	 * If we have more than one misfit sg go with the biggest misfit. 
+	 */ +	if (sgs->group_type == group_misfit_task && +	    sgs->group_misfit_task_load < busiest->group_misfit_task_load)  		return false;  asym_packing: @@ -7901,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd  	struct sched_group *sg = env->sd->groups;  	struct sg_lb_stats *local = &sds->local_stat;  	struct sg_lb_stats tmp_sgs; -	int load_idx, prefer_sibling = 0; +	int load_idx;  	bool overload = false; - -	if (child && child->flags & SD_PREFER_SIBLING) -		prefer_sibling = 1; +	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;  #ifdef CONFIG_NO_HZ_COMMON  	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) @@ -7979,8 +8132,8 @@ next_group:  	if (!env->sd->parent) {  		/* update overload indicator if we are at root domain */ -		if (env->dst_rq->rd->overload != overload) -			env->dst_rq->rd->overload = overload; +		if (READ_ONCE(env->dst_rq->rd->overload) != overload) +			WRITE_ONCE(env->dst_rq->rd->overload, overload);  	}  } @@ -8130,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  	 * factors in sg capacity and sgs with smaller group_type are  	 * skipped when updating the busiest sg:  	 */ -	if (busiest->avg_load <= sds->avg_load || -	    local->avg_load >= sds->avg_load) { +	if (busiest->group_type != group_misfit_task && +	    (busiest->avg_load <= sds->avg_load || +	     local->avg_load >= sds->avg_load)) {  		env->imbalance = 0;  		return fix_small_imbalance(env, sds);  	} @@ -8165,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  		(sds->avg_load - local->avg_load) * local->group_capacity  	) / SCHED_CAPACITY_SCALE; +	/* Boost imbalance to allow misfit task to be balanced. */ +	if (busiest->group_type == group_misfit_task) { +		env->imbalance = max_t(long, env->imbalance, +				       busiest->group_misfit_task_load); +	} +  	/*  	 * if *imbalance is less than the average load per runnable task  	 * there is no guarantee that any tasks will be moved so we'll have @@ -8231,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)  	    busiest->group_no_capacity)  		goto force_balance; +	/* Misfit tasks should be dealt with regardless of the avg load */ +	if (busiest->group_type == group_misfit_task) +		goto force_balance; +  	/*  	 * If the local group is busier than the selected busiest group  	 * don't try and pull any tasks. @@ -8268,8 +8432,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)  force_balance:  	/* Looks like there is an imbalance. Compute it */ +	env->src_grp_type = busiest->group_type;  	calculate_imbalance(env, &sds); -	return sds.busiest; +	return env->imbalance ? sds.busiest : NULL;  out_balanced:  	env->imbalance = 0; @@ -8315,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env,  		if (rt > env->fbq_type)  			continue; +		/* +		 * For ASYM_CPUCAPACITY domains with misfit tasks we simply +		 * seek the "biggest" misfit task. +		 */ +		if (env->src_grp_type == group_misfit_task) { +			if (rq->misfit_task_load > busiest_load) { +				busiest_load = rq->misfit_task_load; +				busiest = rq; +			} + +			continue; +		} +  		capacity = capacity_of(i); +		/* +		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could +		 * eventually lead to active_balancing high->low capacity. +		 * Higher per-CPU capacity is considered better than balancing +		 * average load. 
+		 */ +		if (env->sd->flags & SD_ASYM_CPUCAPACITY && +		    capacity_of(env->dst_cpu) < capacity && +		    rq->nr_running == 1) +			continue; +  		wl = weighted_cpuload(rq);  		/* @@ -8384,6 +8573,9 @@ static int need_active_balance(struct lb_env *env)  			return 1;  	} +	if (env->src_grp_type == group_misfit_task) +		return 1; +  	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);  } @@ -9026,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq)  	if (time_before(now, nohz.next_balance))  		goto out; -	if (rq->nr_running >= 2) { +	if (rq->nr_running >= 2 || rq->misfit_task_load) {  		flags = NOHZ_KICK_MASK;  		goto out;  	} @@ -9395,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)  	rq_unpin_lock(this_rq, rf);  	if (this_rq->avg_idle < sysctl_sched_migration_cost || -	    !this_rq->rd->overload) { +	    !READ_ONCE(this_rq->rd->overload)) {  		rcu_read_lock();  		sd = rcu_dereference_check_sched_domain(this_rq->sd); @@ -9557,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)  	if (static_branch_unlikely(&sched_numa_balancing))  		task_tick_numa(rq, curr); + +	update_misfit_status(curr, rq);  }  /* @@ -9638,7 +9832,8 @@ static inline bool vruntime_normalized(struct task_struct *p)  	 * - A task which has been woken up by try_to_wake_up() and  	 *   waiting for actually being woken up by sched_ttwu_pending().  	 */ -	if (!se->sum_exec_runtime || p->state == TASK_WAKING) +	if (!se->sum_exec_runtime || +	    (p->state == TASK_WAKING && p->sched_remote_wakeup))  		return true;  	return false; |
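
A few sketches of the mechanisms this diff touches follow; in all of them the structs, helpers and constant values outside the patch are simplified stand-ins, not the kernel's real types.

The first hunks relax should_numa_migrate_memory() so that, early in a task's life, first-touch or private faults migrate the page immediately instead of waiting for two passes of the multi-stage node-selection filter. A minimal userspace sketch of that predicate, with booleans standing in for the cpupid helpers:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the two task_struct fields the check reads. */
struct task_sketch {
	int numa_preferred_nid;		/* -1 until a preferred node is picked */
	unsigned int numa_scan_seq;	/* completed NUMA scan passes */
};

/* Early-migration rule: within roughly the first two scan passes
 * (numa_scan_seq <= 4), migrate at once if the page has no recorded last
 * accessor or this task was that accessor. */
static bool migrate_immediately(const struct task_sketch *p,
				bool last_cpupid_unset,
				bool last_cpupid_was_ours)
{
	return (p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
	       (last_cpupid_unset || last_cpupid_was_ours);
}

int main(void)
{
	struct task_sketch young = { .numa_preferred_nid = -1, .numa_scan_seq = 1 };
	struct task_sketch old   = { .numa_preferred_nid = 2,  .numa_scan_seq = 9 };

	printf("%d\n", migrate_immediately(&young, true, false));	/* 1 */
	printf("%d\n", migrate_immediately(&old, true, false));		/* 0 */
	return 0;
}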
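
task_numa_assign() now claims the destination run-queue by flipping rq->numa_migrate_on with xchg(), so two concurrent NUMA balancers cannot pick the same destination CPU, and the claim on a previously chosen best_cpu is released when a better one turns up. A single-threaded userspace sketch of that claim/release pattern, using C11 atomics as stand-ins for the kernel's xchg()/WRITE_ONCE():

#include <stdatomic.h>
#include <stdio.h>

struct rq_sketch {
	atomic_int numa_migrate_on;	/* 1 while a NUMA move targets this rq */
};

static struct rq_sketch runqueues[4];

/* Returns 1 if @cpu was successfully claimed as the migration destination. */
static int claim_dst_cpu(int cpu, int *best_cpu)
{
	/* First to set the flag wins; later balancers bail out. */
	if (atomic_exchange(&runqueues[cpu].numa_migrate_on, 1))
		return 0;

	/* A better destination was found: release the previous claim. */
	if (*best_cpu != -1)
		atomic_store(&runqueues[*best_cpu].numa_migrate_on, 0);

	*best_cpu = cpu;
	return 1;
}

int main(void)
{
	int best = -1;

	printf("claim cpu1: %d\n", claim_dst_cpu(1, &best));		/* 1 */
	printf("claim cpu2: %d\n", claim_dst_cpu(2, &best));		/* 1, cpu1 released */
	printf("claim cpu2 again: %d\n", claim_dst_cpu(2, &best));	/* 0, already claimed */
	return 0;
}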
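
The SMALLIMP filter added to task_numa_compare() requires a candidate swap to clear an absolute floor and to beat the current best candidate by more than half that floor; anything smaller is rejected to avoid ping-ponging tasks for marginal gains. A sketch of just that check, on the same importance scale (max 2*999) the patch comments describe:

#include <stdbool.h>
#include <stdio.h>

#define SMALLIMP 30	/* ~= 1998/64, as in the patch */

static bool importance_is_worth_it(long imp, long best_imp)
{
	if (imp < SMALLIMP || imp <= best_imp + SMALLIMP / 2)
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", importance_is_worth_it(20, 0));	/* 0: below the floor */
	printf("%d\n", importance_is_worth_it(40, 30));	/* 0: not clearly better than best */
	printf("%d\n", importance_is_worth_it(60, 30));	/* 1 */
	return 0;
}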
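
The scan-period reset that used to sit in task_numa_migrate() moves into a new update_scan_period() hook called from migrate_task_rq_fair(): a cross-node move resets the NUMA scan period only when it plausibly changes the placement picture - always before the first scan completes, and afterwards only when the task is pulled off its preferred node. A sketch of that decision, with a stand-in struct for the two task fields it reads:

#include <stdbool.h>
#include <stdio.h>

struct task_sketch {
	unsigned int numa_scan_seq;
	int numa_preferred_nid;		/* -1 if not yet selected */
};

static bool reset_scan_period(const struct task_sketch *p,
			      int src_nid, int dst_nid)
{
	if (src_nid == dst_nid)
		return false;		/* not a cross-node move */

	if (p->numa_scan_seq) {
		/* Moving onto the preferred node, or moving a task that was
		 * not on its preferred node anyway, is not worth a reset. */
		if (dst_nid == p->numa_preferred_nid ||
		    (p->numa_preferred_nid != -1 &&
		     src_nid != p->numa_preferred_nid))
			return false;
	}

	return true;	/* caller would set p->numa_scan_period = task_scan_start(p) */
}

int main(void)
{
	struct task_sketch p = { .numa_scan_seq = 3, .numa_preferred_nid = 1 };

	printf("%d\n", reset_scan_period(&p, 1, 0));	/* 1: pulled off preferred node */
	printf("%d\n", reset_scan_period(&p, 0, 1));	/* 0: moving onto preferred node */
	return 0;
}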
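
task_fits_capacity() is the arithmetic behind both the new misfit tracking (update_misfit_status()) and the rewritten wake_cap() check: a task "fits" a CPU only if the CPU's capacity exceeds the task's estimated utilization by the capacity margin. A userspace sketch, assuming the usual fair.c value of capacity_margin = 1280 (i.e. the task must use less than ~80% of the CPU):

#include <stdio.h>

#define CAPACITY_MARGIN 1280	/* assumed value; fixed point, 1024 == 1.0 */

struct task_sketch {
	unsigned long util_est;	/* estimated utilization, 0..1024 scale */
};

static int task_fits_capacity(const struct task_sketch *p, long capacity)
{
	/* capacity / util > margin / 1024, rewritten to avoid division. */
	return capacity * 1024 > p->util_est * CAPACITY_MARGIN;
}

int main(void)
{
	struct task_sketch heavy = { .util_est = 400 };

	/* A little CPU of capacity 446 cannot hold a 400-util task with the
	 * required headroom (446*1024 < 400*1280), so it is a misfit there. */
	printf("fits on 446: %d\n", task_fits_capacity(&heavy, 446));	/* 0 */
	printf("fits on 1024: %d\n", task_fits_capacity(&heavy, 1024));	/* 1 */
	return 0;
}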
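
The CFS bandwidth hunks add a distribute_running flag because both the period timer and the slack timer must drop cfs_b->lock while handing out runtime; the flag, set and cleared under the lock, ensures only one of them is distributing at a time. The same flag also decides where throttle_cfs_rq() queues a newly throttled cfs_rq: at the head while a distribution is in flight (so the in-flight walk does not see it), at the tail otherwise (so later run-queues are not starved). A pthread-based userspace sketch of the handshake, with the lock and flag as simplified stand-ins:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t cfs_b_lock = PTHREAD_MUTEX_INITIALIZER;
static bool distribute_running;

/* Stand-in for distribute_cfs_runtime(): must run without the lock held. */
static void distribute_runtime(const char *who)
{
	printf("%s distributing runtime\n", who);
}

static void timer_body(const char *who)
{
	pthread_mutex_lock(&cfs_b_lock);
	if (distribute_running) {		/* someone else is already at it */
		pthread_mutex_unlock(&cfs_b_lock);
		return;
	}
	distribute_running = true;
	pthread_mutex_unlock(&cfs_b_lock);

	distribute_runtime(who);		/* lock dropped across the walk */

	pthread_mutex_lock(&cfs_b_lock);
	distribute_running = false;
	pthread_mutex_unlock(&cfs_b_lock);
}

int main(void)
{
	timer_body("period timer");
	timer_body("slack timer");
	return 0;
}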
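
Finally, the load-balancer changes hinge on where the new group_misfit_task type sits in the group_type ordering: above group_other, but below group_imbalanced and group_overloaded, so a group is only treated as "misfit" when nothing worse applies to it. A sketch of that classification order, with the group statistics reduced to plain flags:

#include <stdio.h>

enum group_type {
	group_other = 0,
	group_misfit_task,	/* a CPU has a task too big for its capacity */
	group_imbalanced,
	group_overloaded,
};

static enum group_type classify(int no_capacity, int imbalanced,
				unsigned long misfit_task_load)
{
	if (no_capacity)
		return group_overloaded;
	if (imbalanced)
		return group_imbalanced;
	if (misfit_task_load)
		return group_misfit_task;
	return group_other;
}

int main(void)
{
	/* A group that is over capacity is "overloaded" even if it also
	 * carries a misfit task; plain misfit wins only on its own. */
	printf("%d\n", classify(1, 0, 500));	/* group_overloaded  (3) */
	printf("%d\n", classify(0, 0, 500));	/* group_misfit_task (1) */
	return 0;
}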