Diffstat (limited to 'kernel/sched/fair.c')
 -rw-r--r--   kernel/sched/fair.c | 323
 1 file changed, 280 insertions(+), 43 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c..c36aa54ae071 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,6 +178,11 @@ int __weak arch_asym_cpu_priority(int cpu)
 static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+#endif
+
 #ifdef CONFIG_SYSCTL
 static struct ctl_table sched_fair_sysctls[] = {
 	{
@@ -197,6 +202,16 @@ static struct ctl_table sched_fair_sysctls[] = {
 		.extra1         = SYSCTL_ONE,
 	},
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_promote_rate_limit_MBps",
+		.data		= &sysctl_numa_balancing_promote_rate_limit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
 	{}
 };
 
@@ -1094,9 +1109,6 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
 /* The page with hint page fault latency < threshold in ms is considered hot */
 unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
 
-/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
-unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
-
 struct numa_group {
 	refcount_t refcount;
@@ -2964,7 +2976,7 @@ static void task_numa_work(struct callback_head *work)
 	}
 
 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
-	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+	if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
 		return;
 
 	/*
@@ -4280,14 +4292,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+					     unsigned long uclamp_min,
+					     unsigned long uclamp_max)
 {
-	return clamp(task_util_est(p),
-		     uclamp_eff_value(p, UCLAMP_MIN),
-		     uclamp_eff_value(p, UCLAMP_MAX));
+	return clamp(task_util_est(p), uclamp_min, uclamp_max);
 }
 #else
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+					     unsigned long uclamp_min,
+					     unsigned long uclamp_max)
 {
 	return task_util_est(p);
 }
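The try_cmpxchg() conversion above only changes the calling convention: instead of comparing the returned old value against the expected one, the helper returns a boolean and updates the expected value on failure. A rough userspace sketch of the two conventions, using GCC/Clang atomic builtins as stand-ins for the kernel primitives:

/*
 * Standalone sketch (not kernel code) of the two compare-and-exchange
 * calling conventions touched above.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long numa_next_scan;	/* stands in for mm->numa_next_scan */

/* Old style: compare the returned old value against what we expected. */
static bool update_old_style(unsigned long expected, unsigned long next)
{
	unsigned long seen = __sync_val_compare_and_swap(&numa_next_scan,
							 expected, next);
	return seen == expected;
}

/* try_cmpxchg() style: boolean result, 'expected' is updated on failure. */
static bool update_try_style(unsigned long *expected, unsigned long next)
{
	return __atomic_compare_exchange_n(&numa_next_scan, expected, next,
					   false, __ATOMIC_SEQ_CST,
					   __ATOMIC_SEQ_CST);
}

int main(void)
{
	unsigned long migrate = 100;	/* value we believe is current */

	numa_next_scan = 100;
	printf("old style succeeded: %d\n", update_old_style(migrate, 200));

	numa_next_scan = 100;
	migrate = 100;
	printf("try style succeeded: %d (expected now %lu)\n",
	       update_try_style(&migrate, 300), migrate);
	return 0;
}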
@@ -4426,10 +4440,139 @@ done:
 	trace_sched_util_est_se_tp(&p->se);
 }
 
-static inline int task_fits_capacity(struct task_struct *p,
-				     unsigned long capacity)
+static inline int util_fits_cpu(unsigned long util,
+				unsigned long uclamp_min,
+				unsigned long uclamp_max,
+				int cpu)
 {
-	return fits_capacity(uclamp_task_util(p), capacity);
+	unsigned long capacity_orig, capacity_orig_thermal;
+	unsigned long capacity = capacity_of(cpu);
+	bool fits, uclamp_max_fits;
+
+	/*
+	 * Check if the real util fits without any uclamp boost/cap applied.
+	 */
+	fits = fits_capacity(util, capacity);
+
+	if (!uclamp_is_used())
+		return fits;
+
+	/*
+	 * We must use capacity_orig_of() for comparing against uclamp_min and
+	 * uclamp_max. We only care about capacity pressure (by using
+	 * capacity_of()) for comparing against the real util.
+	 *
+	 * If a task is boosted to 1024 for example, we don't want a tiny
+	 * pressure to skew the check whether it fits a CPU or not.
+	 *
+	 * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+	 * should fit a little cpu even if there's some pressure.
+	 *
+	 * Only exception is for thermal pressure since it has a direct impact
+	 * on available OPP of the system.
+	 *
+	 * We honour it for uclamp_min only as a drop in performance level
+	 * could result in not getting the requested minimum performance level.
+	 *
+	 * For uclamp_max, we can tolerate a drop in performance level as the
+	 * goal is to cap the task. So it's okay if it's getting less.
+	 *
+	 * In case of capacity inversion we should honour the inverted capacity
+	 * for both uclamp_min and uclamp_max all the time.
+	 */
+	capacity_orig = cpu_in_capacity_inversion(cpu);
+	if (capacity_orig) {
+		capacity_orig_thermal = capacity_orig;
+	} else {
+		capacity_orig = capacity_orig_of(cpu);
+		capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+	}
+
+	/*
+	 * We want to force a task to fit a cpu as implied by uclamp_max.
+	 * But we do have some corner cases to cater for..
+	 *
+	 *
+	 *                                 C=z
+	 *   |                             ___
+	 *   |                  C=y       |   |
+	 *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _  uclamp_max
+	 *   |      C=x        |   |      |   |
+	 *   |      ___        |   |      |   |
+	 *   |     |   |       |   |      |   |    (util somewhere in this region)
+	 *   |     |   |       |   |      |   |
+	 *   |     |   |       |   |      |   |
+	 *   +----------------------------------------
+	 *         cpu0        cpu1       cpu2
+	 *
+	 *   In the above example if a task is capped to a specific performance
+	 *   point, y, then when:
+	 *
+	 *   * util = 80% of x then it does not fit on cpu0 and should migrate
+	 *     to cpu1
+	 *   * util = 80% of y then it is forced to fit on cpu1 to honour
+	 *     uclamp_max request.
+	 *
+	 *   which is what we're enforcing here. A task always fits if
+	 *   uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+	 *   the normal upmigration rules should withhold still.
+	 *
+	 *   Only exception is when we are on max capacity, then we need to be
+	 *   careful not to block overutilized state. This is so because:
+	 *
+	 *     1. There's no concept of capping at max_capacity! We can't go
+	 *        beyond this performance level anyway.
+	 *     2. The system is being saturated when we're operating near
+	 *        max capacity, it doesn't make sense to block overutilized.
+	 */
+	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+	fits = fits || uclamp_max_fits;
+
+	/*
+	 *
+	 *                                 C=z
+	 *   |                             ___       (region a, capped, util >= uclamp_max)
+	 *   |                  C=y       |   |
+	 *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+	 *   |      C=x        |   |      |   |
+	 *   |      ___        |   |      |   |      (region b, uclamp_min <= util <= uclamp_max)
+	 *   |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+	 *   |     |   |       |   |      |   |
+	 *   |     |   |       |   |      |   |      (region c, boosted, util < uclamp_min)
+	 *   +----------------------------------------
+	 *         cpu0        cpu1       cpu2
+	 *
+	 * a) If util > uclamp_max, then we're capped, we don't care about
+	 *    actual fitness value here. We only care if uclamp_max fits
+	 *    capacity without taking margin/pressure into account.
+	 *    See comment above.
+	 *
+	 * b) If uclamp_min <= util <= uclamp_max, then the normal
+	 *    fits_capacity() rules apply. Except we need to ensure that we
+	 *    enforce we remain within uclamp_max, see comment above.
+	 *
+	 * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+	 *    need to take into account the boosted value fits the CPU without
+	 *    taking margin/pressure into account.
+	 *
+	 * Cases (a) and (b) are handled in the 'fits' variable already. We
+	 * just need to consider an extra check for case (c) after ensuring we
+	 * handle the case uclamp_min > uclamp_max.
+	 */
+	uclamp_min = min(uclamp_min, uclamp_max);
+	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
+		fits = fits && (uclamp_min <= capacity_orig_thermal);
+
+	return fits;
+}
+
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
+{
+	unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+	unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+	unsigned long util = task_util_est(p);
+	return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
 }
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
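The fitting rules documented in the comment above can be condensed into a small standalone model. The sketch below is plain userspace C, not the kernel function: capacity pressure and capacity inversion are reduced to explicit parameters, and only the uclamp_max forcing rule and the uclamp_min/thermal check are kept.

/*
 * Simplified, standalone model of the rules util_fits_cpu() applies above.
 */
#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

/* fits_capacity(): util must stay below ~80% of capacity (1.25 margin). */
static bool fits_capacity(unsigned long util, unsigned long cap)
{
	return util * 1280 < cap * 1024;
}

static bool util_fits_cpu_model(unsigned long util,
				unsigned long uclamp_min,
				unsigned long uclamp_max,
				unsigned long capacity_orig,
				unsigned long thermal_pressure)
{
	unsigned long capacity_orig_thermal = capacity_orig - thermal_pressure;
	bool fits = fits_capacity(util, capacity_orig);	/* RT/IRQ pressure not modelled */
	bool uclamp_max_fits;

	/* uclamp_max forces a fit, except at the maximum capacity level. */
	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) &&
			  (uclamp_max == SCHED_CAPACITY_SCALE);
	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
	fits = fits || uclamp_max_fits;

	/* Boosted tasks must get uclamp_min out of the (thermally capped) CPU. */
	uclamp_min = uclamp_min < uclamp_max ? uclamp_min : uclamp_max;
	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
		fits = fits && (uclamp_min <= capacity_orig_thermal);

	return fits;
}

int main(void)
{
	/* A util-800 task capped to 512 is forced to fit a little CPU. */
	printf("capped on little:   %d\n", util_fits_cpu_model(800, 0, 512, 512, 0));
	/* The same task without the cap does not fit the little CPU. */
	printf("uncapped on little: %d\n", util_fits_cpu_model(800, 0, 1024, 512, 0));
	/* A small but boosted task does not fit a thermally throttled mid CPU. */
	printf("boosted on hot mid: %d\n", util_fits_cpu_model(100, 700, 1024, 768, 200));
	return 0;
}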
@@ -4442,7 +4585,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 		return;
 	}
 
-	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+	if (task_fits_cpu(p, cpu_of(rq))) {
 		rq->misfit_task_load = 0;
 		return;
 	}
@@ -5862,7 +6005,10 @@ static inline void hrtick_update(struct rq *rq)
 #ifdef CONFIG_SMP
 static inline bool cpu_overutilized(int cpu)
 {
-	return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
+	unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+	unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
 }
 
 static inline void update_overutilized_status(struct rq *rq)
@@ -6654,21 +6800,23 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	unsigned long task_util, best_cap = 0;
+	unsigned long task_util, util_min, util_max, best_cap = 0;
 	int cpu, best_cpu = -1;
 	struct cpumask *cpus;
 
 	cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-	task_util = uclamp_task_util(p);
+	task_util = task_util_est(p);
+	util_min = uclamp_eff_value(p, UCLAMP_MIN);
+	util_max = uclamp_eff_value(p, UCLAMP_MAX);
 
 	for_each_cpu_wrap(cpu, cpus, target) {
 		unsigned long cpu_cap = capacity_of(cpu);
 
 		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
 			continue;
-		if (fits_capacity(task_util, cpu_cap))
+		if (util_fits_cpu(task_util, util_min, util_max, cpu))
 			return cpu;
 
 		if (cpu_cap > best_cap) {
@@ -6680,10 +6828,13 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	return best_cpu;
 }
 
-static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
+static inline bool asym_fits_cpu(unsigned long util,
+				 unsigned long util_min,
+				 unsigned long util_max,
+				 int cpu)
 {
 	if (sched_asym_cpucap_active())
-		return fits_capacity(task_util, capacity_of(cpu));
+		return util_fits_cpu(util, util_min, util_max, cpu);
 
 	return true;
 }
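select_idle_capacity() keeps its overall shape: return the first idle CPU the task fits on, otherwise fall back to the largest-capacity idle CPU seen. A minimal standalone sketch of that selection pattern, with the fitting predicate reduced to the bare capacity margin (the full rules are the util_fits_cpu() ones shown earlier):

/*
 * Standalone sketch (not the kernel code) of the select_idle_capacity()
 * selection pattern: first fit wins, otherwise the biggest idle CPU.
 */
#include <stdbool.h>
#include <stdio.h>

struct cpu {
	unsigned long capacity;	/* capacity after pressure, a la capacity_of() */
	bool idle;
};

/* Stand-in for util_fits_cpu(); uclamp handling elided, 1.25 margin kept. */
static bool fits(unsigned long util, const struct cpu *c)
{
	return util * 1280 < c->capacity * 1024;
}

static int pick_cpu(const struct cpu *cpus, int nr, unsigned long util)
{
	unsigned long best_cap = 0;
	int cpu, best_cpu = -1;

	for (cpu = 0; cpu < nr; cpu++) {
		if (!cpus[cpu].idle)
			continue;
		if (fits(util, &cpus[cpu]))
			return cpu;
		if (cpus[cpu].capacity > best_cap) {
			best_cap = cpus[cpu].capacity;
			best_cpu = cpu;
		}
	}
	return best_cpu;	/* may still be -1 if nothing was idle */
}

int main(void)
{
	struct cpu cpus[] = {
		{ .capacity = 512,  .idle = true  },	/* little */
		{ .capacity = 512,  .idle = true  },	/* little */
		{ .capacity = 1024, .idle = false },	/* big, busy */
	};

	/* A 600-unit task fits no idle CPU; fall back to the biggest idle one. */
	printf("picked cpu%d\n", pick_cpu(cpus, 3, 600));
	return 0;
}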
@@ -6695,7 +6846,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
 	bool has_idle_core = false;
 	struct sched_domain *sd;
-	unsigned long task_util;
+	unsigned long task_util, util_min, util_max;
 	int i, recent_used_cpu;
 
 	/*
@@ -6704,7 +6855,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	if (sched_asym_cpucap_active()) {
 		sync_entity_load_avg(&p->se);
-		task_util = uclamp_task_util(p);
+		task_util = task_util_est(p);
+		util_min = uclamp_eff_value(p, UCLAMP_MIN);
+		util_max = uclamp_eff_value(p, UCLAMP_MAX);
 	}
 
 	/*
@@ -6713,7 +6866,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	lockdep_assert_irqs_disabled();
 
 	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
-	    asym_fits_capacity(task_util, target))
+	    asym_fits_cpu(task_util, util_min, util_max, target))
 		return target;
 
 	/*
@@ -6721,7 +6874,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
-	    asym_fits_capacity(task_util, prev))
+	    asym_fits_cpu(task_util, util_min, util_max, prev))
 		return prev;
 
 	/*
@@ -6736,7 +6889,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    in_task() &&
 	    prev == smp_processor_id() &&
 	    this_rq()->nr_running <= 1 &&
-	    asym_fits_capacity(task_util, prev)) {
+	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
 		return prev;
 	}
 
@@ -6748,7 +6901,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    cpus_share_cache(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
-	    asym_fits_capacity(task_util, recent_used_cpu)) {
+	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
 		return recent_used_cpu;
 	}
 
@@ -7044,6 +7197,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+	unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+	unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
 	struct root_domain *rd = this_rq()->rd;
 	int cpu, best_energy_cpu, target = -1;
 	struct sched_domain *sd;
@@ -7068,7 +7223,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	target = prev_cpu;
 
 	sync_entity_load_avg(&p->se);
-	if (!task_util_est(p))
+	if (!uclamp_task_util(p, p_util_min, p_util_max))
 		goto unlock;
 
 	eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -7076,7 +7231,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	for (; pd; pd = pd->next) {
 		unsigned long cpu_cap, cpu_thermal_cap, util;
 		unsigned long cur_delta, max_spare_cap = 0;
-		bool compute_prev_delta = false;
+		unsigned long rq_util_min, rq_util_max;
+		unsigned long util_min, util_max;
+		unsigned long prev_spare_cap = 0;
 		int max_spare_cap_cpu = -1;
 		unsigned long base_energy;
 
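Switching the early bail-out in find_energy_efficient_cpu() to the clamped utilization means a task with no PELT history but a non-zero uclamp_min boost is no longer skipped by the energy-aware placement. A small standalone illustration, with clampval() and uclamp_task_util() as stand-ins for the kernel helpers:

/*
 * Standalone sketch (not kernel code) of the early-exit change above.
 */
#include <stdio.h>

static unsigned long clampval(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Models uclamp_task_util(p, p_util_min, p_util_max). */
static unsigned long uclamp_task_util(unsigned long task_util_est,
				      unsigned long p_util_min,
				      unsigned long p_util_max)
{
	return clampval(task_util_est, p_util_min, p_util_max);
}

int main(void)
{
	unsigned long task_util_est = 0;	/* newly forked/woken, no history */
	unsigned long p_util_min = 300;		/* boosted via uclamp */
	unsigned long p_util_max = 1024;

	/* Old check: !task_util_est(p) -> skip the EAS placement. */
	printf("old check skips EAS: %s\n", task_util_est ? "no" : "yes");

	/* New check: !uclamp_task_util(p, min, max) -> boosted task is kept. */
	printf("new check skips EAS: %s\n",
	       uclamp_task_util(task_util_est, p_util_min, p_util_max) ? "no" : "yes");
	return 0;
}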
@@ -7112,26 +7269,45 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			 * much capacity we can get out of the CPU; this is
 			 * aligned with sched_cpu_util().
 			 */
-			util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
-			if (!fits_capacity(util, cpu_cap))
+			if (uclamp_is_used()) {
+				if (uclamp_rq_is_idle(cpu_rq(cpu))) {
+					util_min = p_util_min;
+					util_max = p_util_max;
+				} else {
+					/*
+					 * Open code uclamp_rq_util_with() except for
+					 * the clamp() part. Ie: apply max aggregation
+					 * only. util_fits_cpu() logic requires to
+					 * operate on non clamped util but must use the
+					 * max-aggregated uclamp_{min, max}.
+					 */
+					rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+					rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+					util_min = max(rq_util_min, p_util_min);
+					util_max = max(rq_util_max, p_util_max);
+				}
+			}
+			if (!util_fits_cpu(util, util_min, util_max, cpu))
 				continue;
 
 			lsub_positive(&cpu_cap, util);
 
 			if (cpu == prev_cpu) {
 				/* Always use prev_cpu as a candidate. */
-				compute_prev_delta = true;
+				prev_spare_cap = cpu_cap;
 			} else if (cpu_cap > max_spare_cap) {
 				/*
 				 * Find the CPU with the maximum spare capacity
-				 * in the performance domain.
+				 * among the remaining CPUs in the performance
+				 * domain.
 				 */
 				max_spare_cap = cpu_cap;
 				max_spare_cap_cpu = cpu;
 			}
 		}
 
-		if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+		if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
 			continue;
 
 		eenv_pd_busy_time(&eenv, cpus, p);
@@ -7139,7 +7315,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		base_energy = compute_energy(&eenv, pd, cpus, p, -1);
 
 		/* Evaluate the energy impact of using prev_cpu. */
-		if (compute_prev_delta) {
+		if (prev_spare_cap > 0) {
 			prev_delta = compute_energy(&eenv, pd, cpus, p,
 						    prev_cpu);
 			/* CPU utilization has changed */
@@ -7150,7 +7326,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		}
 
 		/* Evaluate the energy impact of using max_spare_cap_cpu. */
-		if (max_spare_cap_cpu >= 0) {
+		if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
 			cur_delta = compute_energy(&eenv, pd, cpus, p,
 						   max_spare_cap_cpu);
 			/* CPU utilization has changed */
@@ -8276,7 +8452,7 @@ static int detach_tasks(struct lb_env *env)
 
 		case migrate_misfit:
 			/* This is not a misfit task */
-			if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+			if (task_fits_cpu(p, env->src_cpu))
 				goto next;
 
 			env->imbalance = 0;
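Two decisions from the hunks above, sketched in isolation: the rq-wide and task uclamp values are combined by max aggregation before util_fits_cpu() is called (the highest requested min and the highest requested max win), and the max-spare-capacity candidate is only priced when it actually has more spare capacity than prev_cpu. Standalone C with made-up values, not the kernel code:

/* Sketch of max aggregation and of the prev_spare_cap candidate filter. */
#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	/* Max aggregation: the larger of the rq-wide and task values wins. */
	unsigned long rq_util_min = 0, rq_util_max = 512;	/* rq capped to 512 */
	unsigned long p_util_min = 200, p_util_max = 1024;	/* waking task */
	unsigned long util_min = max_ul(rq_util_min, p_util_min);
	unsigned long util_max = max_ul(rq_util_max, p_util_max);

	printf("aggregated util_min=%lu util_max=%lu\n", util_min, util_max);

	/* Candidate filtering: skip the energy computation when prev wins. */
	unsigned long prev_spare_cap = 300, max_spare_cap = 250;
	int max_spare_cap_cpu = 2;

	if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap)
		printf("evaluate the energy of cpu%d\n", max_spare_cap_cpu);
	else
		printf("prev_cpu candidate is at least as good, skip\n");
	return 0;
}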
@@ -8665,16 +8841,73 @@ static unsigned long scale_rt_capacity(int cpu)
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
+	unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
 	unsigned long capacity = scale_rt_capacity(cpu);
 	struct sched_group *sdg = sd->groups;
+	struct rq *rq = cpu_rq(cpu);
 
-	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
+	rq->cpu_capacity_orig = capacity_orig;
 
 	if (!capacity)
 		capacity = 1;
 
-	cpu_rq(cpu)->cpu_capacity = capacity;
-	trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+	rq->cpu_capacity = capacity;
+
+	/*
+	 * Detect if the performance domain is in capacity inversion state.
+	 *
+	 * Capacity inversion happens when another perf domain with equal or
+	 * lower capacity_orig_of() ends up having higher capacity than this
+	 * domain after subtracting thermal pressure.
+	 *
+	 * We only take into account thermal pressure in this detection as it's
+	 * the only metric that actually results in *real* reduction of
+	 * capacity due to performance points (OPPs) being dropped/become
+	 * unreachable due to thermal throttling.
+	 *
+	 * We assume:
+	 *   * That all cpus in a perf domain have the same capacity_orig
+	 *     (same uArch).
+	 *   * Thermal pressure will impact all cpus in this perf domain
+	 *     equally.
+	 */
+	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+		unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
+		struct perf_domain *pd = rcu_dereference(rq->rd->pd);
+
+		rq->cpu_capacity_inverted = 0;
+
+		for (; pd; pd = pd->next) {
+			struct cpumask *pd_span = perf_domain_span(pd);
+			unsigned long pd_cap_orig, pd_cap;
+
+			cpu = cpumask_any(pd_span);
+			pd_cap_orig = arch_scale_cpu_capacity(cpu);
+
+			if (capacity_orig < pd_cap_orig)
+				continue;
+
+			/*
+			 * handle the case of multiple perf domains have the
+			 * same capacity_orig but one of them is under higher
+			 * thermal pressure. We record it as capacity
+			 * inversion.
+			 */
+			if (capacity_orig == pd_cap_orig) {
+				pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
+
+				if (pd_cap > inv_cap) {
+					rq->cpu_capacity_inverted = inv_cap;
+					break;
+				}
+			} else if (pd_cap_orig > inv_cap) {
+				rq->cpu_capacity_inverted = inv_cap;
+				break;
+			}
+		}
+	}
+
+	trace_sched_cpu_capacity_tp(rq);
 
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = capacity;
@@ -9281,6 +9514,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 
 	memset(sgs, 0, sizeof(*sgs));
 
+	/* Assume that task can't fit any CPU of the group */
+	if (sd->flags & SD_ASYM_CPUCAPACITY)
+		sgs->group_misfit_task_load = 1;
+
 	for_each_cpu(i, sched_group_span(group)) {
 		struct rq *rq = cpu_rq(i);
 		unsigned int local;
@@ -9300,12 +9537,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 		if (!nr_running && idle_cpu_without(i, p))
 			sgs->idle_cpus++;
 
-	}
+		/* Check if task fits in the CPU */
+		if (sd->flags & SD_ASYM_CPUCAPACITY &&
+		    sgs->group_misfit_task_load &&
+		    task_fits_cpu(p, i))
+			sgs->group_misfit_task_load = 0;
 
-	/* Check if task fits in the group */
-	if (sd->flags & SD_ASYM_CPUCAPACITY &&
-	    !task_fits_capacity(p, group->sgc->max_capacity)) {
-		sgs->group_misfit_task_load = 1;
 	}
 
 	sgs->group_capacity = group->sgc->capacity;
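The capacity inversion detection added to update_cpu_capacity() above boils down to: a CPU is inverted when some performance domain of equal or lower original capacity ends up with more capacity left once thermal pressure is subtracted. A standalone sketch over an array of made-up performance domains (plain C, not the kernel's rq/perf_domain structures):

/*
 * Standalone model of the capacity inversion check. Each entry stands in
 * for one performance domain; thermal values are hypothetical.
 */
#include <stdio.h>

struct pd {
	unsigned long cap_orig;		/* a la arch_scale_cpu_capacity() */
	unsigned long thermal;		/* a la thermal_load_avg() */
};

/* Returns the inverted (thermally reduced) capacity, or 0 if no inversion. */
static unsigned long capacity_inversion(const struct pd *self,
					const struct pd *pds, int nr)
{
	unsigned long inv_cap = self->cap_orig - self->thermal;
	int i;

	for (i = 0; i < nr; i++) {
		unsigned long pd_cap_orig = pds[i].cap_orig;
		unsigned long pd_cap = pd_cap_orig - pds[i].thermal;

		if (self->cap_orig < pd_cap_orig)
			continue;	/* only equal or lower capacity domains */

		if (self->cap_orig == pd_cap_orig) {
			/* Same capacity class, but the other domain is cooler. */
			if (pd_cap > inv_cap)
				return inv_cap;
		} else if (pd_cap_orig > inv_cap) {
			/* A nominally smaller domain now offers more capacity. */
			return inv_cap;
		}
	}
	return 0;
}

int main(void)
{
	/* Big cluster heavily throttled, little cluster cool. */
	struct pd pds[] = {
		{ .cap_orig = 1024, .thermal = 600 },	/* big */
		{ .cap_orig = 512,  .thermal = 0 },	/* little */
	};

	printf("big inverted cap:    %lu\n", capacity_inversion(&pds[0], pds, 2));	/* 424 */
	printf("little inverted cap: %lu\n", capacity_inversion(&pds[1], pds, 2));	/* 0 */
	return 0;
}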