diff options
Diffstat (limited to 'kernel/sched/fair.c')
| -rw-r--r-- | kernel/sched/fair.c | 794 | 
1 files changed, 519 insertions, 275 deletions
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;  #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * 1024 < capacity * margin + */ +unsigned int capacity_margin = 1280; /* ~20% */ +  static inline void update_load_add(struct load_weight *lw, unsigned long inc)  {  	lw->weight += inc; @@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)  static inline struct task_struct *task_of(struct sched_entity *se)  { -#ifdef CONFIG_SCHED_DEBUG -	WARN_ON_ONCE(!entity_is_task(se)); -#endif +	SCHED_WARN_ON(!entity_is_task(se));  	return container_of(se, struct task_struct, se);  } @@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,  static void update_min_vruntime(struct cfs_rq *cfs_rq)  { +	struct sched_entity *curr = cfs_rq->curr; +  	u64 vruntime = cfs_rq->min_vruntime; -	if (cfs_rq->curr) -		vruntime = cfs_rq->curr->vruntime; +	if (curr) { +		if (curr->on_rq) +			vruntime = curr->vruntime; +		else +			curr = NULL; +	}  	if (cfs_rq->rb_leftmost) {  		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,  						   struct sched_entity,  						   run_node); -		if (!cfs_rq->curr) +		if (!curr)  			vruntime = se->vruntime;  		else  			vruntime = min_vruntime(vruntime, se->vruntime); @@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);  static unsigned long task_h_load(struct task_struct *p);  /* @@ -680,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)  	 * will definitely be update (after enqueue).  	 
*/  	sa->period_contrib = 1023; -	sa->load_avg = scale_load_down(se->load.weight); +	/* +	 * Tasks are intialized with full load to be seen as heavy tasks until +	 * they get a chance to stabilize to their real load level. +	 * Group entities are intialized with zero load to reflect the fact that +	 * nothing has been attached to the task group yet. +	 */ +	if (entity_is_task(se)) +		sa->load_avg = scale_load_down(se->load.weight);  	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;  	/*  	 * At this point, util_avg won't be used in select_task_rq_fair anyway @@ -726,7 +743,6 @@ void post_init_entity_util_avg(struct sched_entity *se)  	struct sched_avg *sa = &se->avg;  	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;  	u64 now = cfs_rq_clock_task(cfs_rq); -	int tg_update;  	if (cap > 0) {  		if (cfs_rq->avg.util_avg != 0) { @@ -759,10 +775,9 @@ void post_init_entity_util_avg(struct sched_entity *se)  		}  	} -	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); +	update_cfs_rq_load_avg(now, cfs_rq, false);  	attach_entity_load_avg(cfs_rq, se); -	if (tg_update) -		update_tg_load_avg(cfs_rq, false); +	update_tg_load_avg(cfs_rq, false);  }  #else /* !CONFIG_SMP */ @@ -799,7 +814,7 @@ static void update_curr(struct cfs_rq *cfs_rq)  		      max(delta_exec, curr->statistics.exec_max));  	curr->sum_exec_runtime += delta_exec; -	schedstat_add(cfs_rq, exec_clock, delta_exec); +	schedstat_add(cfs_rq->exec_clock, delta_exec);  	curr->vruntime += calc_delta_fair(delta_exec, curr);  	update_min_vruntime(cfs_rq); @@ -820,26 +835,34 @@ static void update_curr_fair(struct rq *rq)  	update_curr(cfs_rq_of(&rq->curr->se));  } -#ifdef CONFIG_SCHEDSTATS  static inline void  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)  { -	u64 wait_start = rq_clock(rq_of(cfs_rq)); +	u64 wait_start, prev_wait_start; + +	if (!schedstat_enabled()) +		return; + +	wait_start = rq_clock(rq_of(cfs_rq)); +	prev_wait_start = schedstat_val(se->statistics.wait_start);  
	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && -	    likely(wait_start > se->statistics.wait_start)) -		wait_start -= se->statistics.wait_start; +	    likely(wait_start > prev_wait_start)) +		wait_start -= prev_wait_start; -	se->statistics.wait_start = wait_start; +	schedstat_set(se->statistics.wait_start, wait_start);  } -static void +static inline void  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	struct task_struct *p;  	u64 delta; -	delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; +	if (!schedstat_enabled()) +		return; + +	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);  	if (entity_is_task(se)) {  		p = task_of(se); @@ -849,35 +872,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)  			 * time stamp can be adjusted to accumulate wait time  			 * prior to migration.  			 */ -			se->statistics.wait_start = delta; +			schedstat_set(se->statistics.wait_start, delta);  			return;  		}  		trace_sched_stat_wait(p, delta);  	} -	se->statistics.wait_max = max(se->statistics.wait_max, delta); -	se->statistics.wait_count++; -	se->statistics.wait_sum += delta; -	se->statistics.wait_start = 0; +	schedstat_set(se->statistics.wait_max, +		      max(schedstat_val(se->statistics.wait_max), delta)); +	schedstat_inc(se->statistics.wait_count); +	schedstat_add(se->statistics.wait_sum, delta); +	schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	struct task_struct *tsk = NULL; +	u64 sleep_start, block_start; + +	if (!schedstat_enabled()) +		return; + +	sleep_start = schedstat_val(se->statistics.sleep_start); +	block_start = schedstat_val(se->statistics.block_start); + +	if (entity_is_task(se)) +		tsk = task_of(se); + +	if (sleep_start) { +		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; + +		if ((s64)delta < 0) +			delta = 0; + +		if (unlikely(delta > 
schedstat_val(se->statistics.sleep_max))) +			schedstat_set(se->statistics.sleep_max, delta); + +		schedstat_set(se->statistics.sleep_start, 0); +		schedstat_add(se->statistics.sum_sleep_runtime, delta); + +		if (tsk) { +			account_scheduler_latency(tsk, delta >> 10, 1); +			trace_sched_stat_sleep(tsk, delta); +		} +	} +	if (block_start) { +		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; + +		if ((s64)delta < 0) +			delta = 0; + +		if (unlikely(delta > schedstat_val(se->statistics.block_max))) +			schedstat_set(se->statistics.block_max, delta); + +		schedstat_set(se->statistics.block_start, 0); +		schedstat_add(se->statistics.sum_sleep_runtime, delta); + +		if (tsk) { +			if (tsk->in_iowait) { +				schedstat_add(se->statistics.iowait_sum, delta); +				schedstat_inc(se->statistics.iowait_count); +				trace_sched_stat_iowait(tsk, delta); +			} + +			trace_sched_stat_blocked(tsk, delta); + +			/* +			 * Blocking time is in units of nanosecs, so shift by +			 * 20 to get a milliseconds-range estimation of the +			 * amount of time that the task spent sleeping: +			 */ +			if (unlikely(prof_on == SLEEP_PROFILING)) { +				profile_hits(SLEEP_PROFILING, +						(void *)get_wchan(tsk), +						delta >> 20); +			} +			account_scheduler_latency(tsk, delta >> 10, 0); +		} +	}  }  /*   * Task is being enqueued - update stats:   */  static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  { +	if (!schedstat_enabled()) +		return; +  	/*  	 * Are we enqueueing a waiting task? 
(for current tasks  	 * a dequeue/enqueue event is a NOP)  	 */  	if (se != cfs_rq->curr)  		update_stats_wait_start(cfs_rq, se); + +	if (flags & ENQUEUE_WAKEUP) +		update_stats_enqueue_sleeper(cfs_rq, se);  }  static inline void  update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  { + +	if (!schedstat_enabled()) +		return; +  	/*  	 * Mark the end of the wait period if dequeueing a  	 * waiting task: @@ -885,40 +987,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	if (se != cfs_rq->curr)  		update_stats_wait_end(cfs_rq, se); -	if (flags & DEQUEUE_SLEEP) { -		if (entity_is_task(se)) { -			struct task_struct *tsk = task_of(se); +	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { +		struct task_struct *tsk = task_of(se); -			if (tsk->state & TASK_INTERRUPTIBLE) -				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); -			if (tsk->state & TASK_UNINTERRUPTIBLE) -				se->statistics.block_start = rq_clock(rq_of(cfs_rq)); -		} +		if (tsk->state & TASK_INTERRUPTIBLE) +			schedstat_set(se->statistics.sleep_start, +				      rq_clock(rq_of(cfs_rq))); +		if (tsk->state & TASK_UNINTERRUPTIBLE) +			schedstat_set(se->statistics.block_start, +				      rq_clock(rq_of(cfs_rq)));  	} - -} -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{  } -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ -} -#endif -  /*   * We are picking a new current task - update its stats:   */ @@ -1513,8 +1593,16 @@ balance:  	 * One idle CPU per node is evaluated for a task numa move.  	 * Call select_idle_sibling to maybe find a better one.  	 
*/ -	if (!cur) -		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); +	if (!cur) { +		/* +		 * select_idle_siblings() uses an per-cpu cpumask that +		 * can be used from IRQ context. +		 */ +		local_irq_disable(); +		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, +						   env->dst_cpu); +		local_irq_enable(); +	}  assign:  	task_numa_assign(env, cur, imp); @@ -2292,7 +2380,7 @@ void task_numa_work(struct callback_head *work)  	unsigned long nr_pte_updates = 0;  	long pages, virtpages; -	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); +	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));  	work->next = work; /* protect against double add */  	/* @@ -2803,9 +2891,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,  }  #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly).   
*/  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)  { @@ -2875,12 +2975,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}  static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)  { -	struct rq *rq = rq_of(cfs_rq); -	int cpu = cpu_of(rq); - -	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { -		unsigned long max = rq->cpu_capacity_orig; - +	if (&this_rq()->cfs == cfs_rq) {  		/*  		 * There are a few boundary cases this might miss but it should  		 * get called often enough that that should (hopefully) not be @@ -2897,8 +2992,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)  		 *  		 * See cpu_util().  		 */ -		cpufreq_update_util(rq_clock(rq), -				    min(cfs_rq->avg.util_avg, max), max); +		cpufreq_update_util(rq_of(cfs_rq), 0);  	}  } @@ -2931,10 +3025,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)   *   * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.   * - * Returns true if the load decayed or we removed utilization. It is expected - * that one calls update_tg_load_avg() on this condition, but after you've - * modified the cfs_rq avg (attach/detach), such that we propagate the new - * avg up. + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true.   
*/  static inline int  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -3159,10 +3253,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)  static inline void update_load_avg(struct sched_entity *se, int not_used)  { -	struct cfs_rq *cfs_rq = cfs_rq_of(se); -	struct rq *rq = rq_of(cfs_rq); - -	cpufreq_trigger_update(rq_clock(rq)); +	cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);  }  static inline void @@ -3183,68 +3274,6 @@ static inline int idle_balance(struct rq *rq)  #endif /* CONFIG_SMP */ -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS -	struct task_struct *tsk = NULL; - -	if (entity_is_task(se)) -		tsk = task_of(se); - -	if (se->statistics.sleep_start) { -		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; - -		if ((s64)delta < 0) -			delta = 0; - -		if (unlikely(delta > se->statistics.sleep_max)) -			se->statistics.sleep_max = delta; - -		se->statistics.sleep_start = 0; -		se->statistics.sum_sleep_runtime += delta; - -		if (tsk) { -			account_scheduler_latency(tsk, delta >> 10, 1); -			trace_sched_stat_sleep(tsk, delta); -		} -	} -	if (se->statistics.block_start) { -		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; - -		if ((s64)delta < 0) -			delta = 0; - -		if (unlikely(delta > se->statistics.block_max)) -			se->statistics.block_max = delta; - -		se->statistics.block_start = 0; -		se->statistics.sum_sleep_runtime += delta; - -		if (tsk) { -			if (tsk->in_iowait) { -				se->statistics.iowait_sum += delta; -				se->statistics.iowait_count++; -				trace_sched_stat_iowait(tsk, delta); -			} - -			trace_sched_stat_blocked(tsk, delta); - -			/* -			 * Blocking time is in units of nanosecs, so shift by -			 * 20 to get a milliseconds-range estimation of the -			 * amount of time that the task spent sleeping: -			 */ -			if (unlikely(prof_on == SLEEP_PROFILING)) { -				profile_hits(SLEEP_PROFILING, -						(void 
*)get_wchan(tsk), -						delta >> 20); -			} -			account_scheduler_latency(tsk, delta >> 10, 0); -		} -	} -#endif -} -  static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  #ifdef CONFIG_SCHED_DEBUG @@ -3254,7 +3283,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)  		d = -d;  	if (d > 3*sysctl_sched_latency) -		schedstat_inc(cfs_rq, nr_spread_over); +		schedstat_inc(cfs_rq->nr_spread_over);  #endif  } @@ -3371,17 +3400,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	account_entity_enqueue(cfs_rq, se);  	update_cfs_shares(cfs_rq); -	if (flags & ENQUEUE_WAKEUP) { +	if (flags & ENQUEUE_WAKEUP)  		place_entity(cfs_rq, se, 0); -		if (schedstat_enabled()) -			enqueue_sleeper(cfs_rq, se); -	}  	check_schedstat_required(); -	if (schedstat_enabled()) { -		update_stats_enqueue(cfs_rq, se); -		check_spread(cfs_rq, se); -	} +	update_stats_enqueue(cfs_rq, se, flags); +	check_spread(cfs_rq, se);  	if (!curr)  		__enqueue_entity(cfs_rq, se);  	se->on_rq = 1; @@ -3448,8 +3472,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	update_curr(cfs_rq);  	dequeue_entity_load_avg(cfs_rq, se); -	if (schedstat_enabled()) -		update_stats_dequeue(cfs_rq, se, flags); +	update_stats_dequeue(cfs_rq, se, flags);  	clear_buddies(cfs_rq, se); @@ -3459,9 +3482,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	account_entity_dequeue(cfs_rq, se);  	/* -	 * Normalize the entity after updating the min_vruntime because the -	 * update can refer to the ->curr item and we need to reflect this -	 * movement in our normalized position. +	 * Normalize after update_curr(); which will also have moved +	 * min_vruntime if @se is the one holding it back. But before doing +	 * update_min_vruntime() again, which will discount @se's position and +	 * can move min_vruntime forward still more.  	 
*/  	if (!(flags & DEQUEUE_SLEEP))  		se->vruntime -= cfs_rq->min_vruntime; @@ -3469,8 +3493,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	/* return excess runtime on last dequeue */  	return_cfs_rq_runtime(cfs_rq); -	update_min_vruntime(cfs_rq);  	update_cfs_shares(cfs_rq); + +	/* +	 * Now advance min_vruntime if @se was the entity holding it back, +	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be +	 * put back on, and if we advance min_vruntime, we'll be placed back +	 * further than we started -- ie. we'll be penalized. +	 */ +	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) +		update_min_vruntime(cfs_rq);  }  /* @@ -3523,25 +3555,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)  		 * a CPU. So account for the time it spent waiting on the  		 * runqueue.  		 */ -		if (schedstat_enabled()) -			update_stats_wait_end(cfs_rq, se); +		update_stats_wait_end(cfs_rq, se);  		__dequeue_entity(cfs_rq, se);  		update_load_avg(se, 1);  	}  	update_stats_curr_start(cfs_rq, se);  	cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS +  	/*  	 * Track our maximum slice length, if the CPU's load is at  	 * least twice that of our own weight (i.e. 
dont track it  	 * when there are only lesser-weight tasks around):  	 */  	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { -		se->statistics.slice_max = max(se->statistics.slice_max, -			se->sum_exec_runtime - se->prev_sum_exec_runtime); +		schedstat_set(se->statistics.slice_max, +			max((u64)schedstat_val(se->statistics.slice_max), +			    se->sum_exec_runtime - se->prev_sum_exec_runtime));  	} -#endif +  	se->prev_sum_exec_runtime = se->sum_exec_runtime;  } @@ -3620,13 +3652,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)  	/* throttle cfs_rqs exceeding runtime */  	check_cfs_rq_runtime(cfs_rq); -	if (schedstat_enabled()) { -		check_spread(cfs_rq, prev); -		if (prev->on_rq) -			update_stats_wait_start(cfs_rq, prev); -	} +	check_spread(cfs_rq, prev);  	if (prev->on_rq) { +		update_stats_wait_start(cfs_rq, prev);  		/* Put 'current' back into the tree. */  		__enqueue_entity(cfs_rq, prev);  		/* in !on_rq case, update occurred at dequeue */ @@ -4456,9 +4485,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)  	struct sched_entity *se = &p->se;  	struct cfs_rq *cfs_rq = cfs_rq_of(se); -	WARN_ON(task_rq(p) != rq); +	SCHED_WARN_ON(task_rq(p) != rq); -	if (cfs_rq->nr_running > 1) { +	if (rq->cfs.h_nr_running > 1) {  		u64 slice = sched_slice(cfs_rq, se);  		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;  		s64 delta = slice - ran; @@ -4509,6 +4538,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	struct cfs_rq *cfs_rq;  	struct sched_entity *se = &p->se; +	/* +	 * If in_iowait is set, the code below may not trigger any cpufreq +	 * utilization updates, so do it here explicitly with the IOWAIT flag +	 * passed. 
+	 */ +	if (p->in_iowait) +		cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); +  	for_each_sched_entity(se) {  		if (se->on_rq)  			break; @@ -4605,6 +4642,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  }  #ifdef CONFIG_SMP + +/* Working cpumask for: load_balance, load_balance_newidle. */ +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +  #ifdef CONFIG_NO_HZ_COMMON  /*   * per rq 'load' arrray crap; XXX kill this. @@ -5006,9 +5048,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  		 * wl = S * s'_i; see (2)  		 */  		if (W > 0 && w < W) -			wl = (w * (long)tg->shares) / W; +			wl = (w * (long)scale_load_down(tg->shares)) / W;  		else -			wl = tg->shares; +			wl = scale_load_down(tg->shares);  		/*  		 * Per the above, wl is the new se->load.weight value; since @@ -5091,18 +5133,18 @@ static int wake_wide(struct task_struct *p)  	return 1;  } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, +		       int prev_cpu, int sync)  {  	s64 this_load, load;  	s64 this_eff_load, prev_eff_load; -	int idx, this_cpu, prev_cpu; +	int idx, this_cpu;  	struct task_group *tg;  	unsigned long weight;  	int balanced;  	idx	  = sd->wake_idx;  	this_cpu  = smp_processor_id(); -	prev_cpu  = task_cpu(p);  	load	  = source_load(prev_cpu, idx);  	this_load = target_load(this_cpu, idx); @@ -5146,13 +5188,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)  	balanced = this_eff_load <= prev_eff_load; -	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); +	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);  	if (!balanced)  		return 0; -	schedstat_inc(sd, ttwu_move_affine); -	schedstat_inc(p, se.statistics.nr_wakeups_affine); +	schedstat_inc(sd->ttwu_move_affine); +	
schedstat_inc(p->se.statistics.nr_wakeups_affine);  	return 1;  } @@ -5228,6 +5270,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)  	int shallowest_idle_cpu = -1;  	int i; +	/* Check if we have any choice: */ +	if (group->group_weight == 1) +		return cpumask_first(sched_group_cpus(group)); +  	/* Traverse only the allowed CPUs */  	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {  		if (idle_cpu(i)) { @@ -5265,64 +5311,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)  }  /* - * Try and locate an idle CPU in the sched_domain. + * Implement a for_each_cpu() variant that starts the scan at a given cpu + * (@start), and wraps around. + * + * This is used to scan for idle CPUs; such that not all CPUs looking for an + * idle CPU find the same CPU. The down-side is that tasks tend to cycle + * through the LLC domain. + * + * Especially tbench is found sensitive to this. + */ + +static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) +{ +	int next; + +again: +	next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); + +	if (*wrapped) { +		if (next >= start) +			return nr_cpumask_bits; +	} else { +		if (next >= nr_cpumask_bits) { +			*wrapped = 1; +			n = -1; +			goto again; +		} +	} + +	return next; +} + +#define for_each_cpu_wrap(cpu, mask, start, wrap)				\ +	for ((wrap) = 0, (cpu) = (start)-1;					\ +		(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),	\ +		(cpu) < nr_cpumask_bits; ) + +#ifdef CONFIG_SCHED_SMT + +static inline void set_idle_cores(int cpu, int val) +{ +	struct sched_domain_shared *sds; + +	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); +	if (sds) +		WRITE_ONCE(sds->has_idle_cores, val); +} + +static inline bool test_idle_cores(int cpu, bool def) +{ +	struct sched_domain_shared *sds; + +	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); +	if (sds) +		return READ_ONCE(sds->has_idle_cores); + +	return def; +} 
+ +/* + * Scans the local SMT mask to see if the entire core is idle, and records this + * information in sd_llc_shared->has_idle_cores. + * + * Since SMT siblings share all cache levels, inspecting this limited remote + * state should be fairly cheap. + */ +void __update_idle_core(struct rq *rq) +{ +	int core = cpu_of(rq); +	int cpu; + +	rcu_read_lock(); +	if (test_idle_cores(core, true)) +		goto unlock; + +	for_each_cpu(cpu, cpu_smt_mask(core)) { +		if (cpu == core) +			continue; + +		if (!idle_cpu(cpu)) +			goto unlock; +	} + +	set_idle_cores(core, 1); +unlock: +	rcu_read_unlock(); +} + +/* + * Scan the entire LLC domain for idle cores; this dynamically switches off if + * there are no idle cores left in the system; tracked through + * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. + */ +static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ +	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); +	int core, cpu, wrap; + +	if (!static_branch_likely(&sched_smt_present)) +		return -1; + +	if (!test_idle_cores(target, false)) +		return -1; + +	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); + +	for_each_cpu_wrap(core, cpus, target, wrap) { +		bool idle = true; + +		for_each_cpu(cpu, cpu_smt_mask(core)) { +			cpumask_clear_cpu(cpu, cpus); +			if (!idle_cpu(cpu)) +				idle = false; +		} + +		if (idle) +			return core; +	} + +	/* +	 * Failed to find an idle core; stop looking for one. +	 */ +	set_idle_cores(target, 0); + +	return -1; +} + +/* + * Scan the local SMT mask for idle CPUs. 
+ */ +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ +	int cpu; + +	if (!static_branch_likely(&sched_smt_present)) +		return -1; + +	for_each_cpu(cpu, cpu_smt_mask(target)) { +		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) +			continue; +		if (idle_cpu(cpu)) +			return cpu; +	} + +	return -1; +} + +#else /* CONFIG_SCHED_SMT */ + +static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ +	return -1; +} + +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ +	return -1; +} + +#endif /* CONFIG_SCHED_SMT */ + +/* + * Scan the LLC domain for idle CPUs; this is dynamically regulated by + * comparing the average scan cost (tracked in sd->avg_scan_cost) against the + * average idle time for this rq (as found in rq->avg_idle). + */ +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +{ +	struct sched_domain *this_sd; +	u64 avg_cost, avg_idle = this_rq()->avg_idle; +	u64 time, cost; +	s64 delta; +	int cpu, wrap; + +	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); +	if (!this_sd) +		return -1; + +	avg_cost = this_sd->avg_scan_cost; + +	/* +	 * Due to large variance we need a large fuzz factor; hackbench in +	 * particularly is sensitive here. +	 */ +	if ((avg_idle / 512) < avg_cost) +		return -1; + +	time = local_clock(); + +	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { +		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) +			continue; +		if (idle_cpu(cpu)) +			break; +	} + +	time = local_clock() - time; +	cost = this_sd->avg_scan_cost; +	delta = (s64)(time - cost) / 8; +	this_sd->avg_scan_cost += delta; + +	return cpu; +} + +/* + * Try and locate an idle core/thread in the LLC cache domain.   
*/ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target)  {  	struct sched_domain *sd; -	struct sched_group *sg; -	int i = task_cpu(p); +	int i;  	if (idle_cpu(target))  		return target;  	/* -	 * If the prevous cpu is cache affine and idle, don't be stupid. +	 * If the previous cpu is cache affine and idle, don't be stupid.  	 */ -	if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) -		return i; +	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) +		return prev; -	/* -	 * Otherwise, iterate the domains and find an eligible idle cpu. -	 * -	 * A completely idle sched group at higher domains is more -	 * desirable than an idle group at a lower level, because lower -	 * domains have smaller groups and usually share hardware -	 * resources which causes tasks to contend on them, e.g. x86 -	 * hyperthread siblings in the lowest domain (SMT) can contend -	 * on the shared cpu pipeline. -	 * -	 * However, while we prefer idle groups at higher domains -	 * finding an idle cpu at the lowest domain is still better than -	 * returning 'target', which we've already established, isn't -	 * idle. -	 */  	sd = rcu_dereference(per_cpu(sd_llc, target)); -	for_each_lower_domain(sd) { -		sg = sd->groups; -		do { -			if (!cpumask_intersects(sched_group_cpus(sg), -						tsk_cpus_allowed(p))) -				goto next; - -			/* Ensure the entire group is idle */ -			for_each_cpu(i, sched_group_cpus(sg)) { -				if (i == target || !idle_cpu(i)) -					goto next; -			} +	if (!sd) +		return target; + +	i = select_idle_core(p, sd, target); +	if ((unsigned)i < nr_cpumask_bits) +		return i; + +	i = select_idle_cpu(p, sd, target); +	if ((unsigned)i < nr_cpumask_bits) +		return i; + +	i = select_idle_smt(p, sd, target); +	if ((unsigned)i < nr_cpumask_bits) +		return i; -			/* -			 * It doesn't matter which cpu we pick, the -			 * whole group is idle. 
-			 */ -			target = cpumask_first_and(sched_group_cpus(sg), -					tsk_cpus_allowed(p)); -			goto done; -next: -			sg = sg->next; -		} while (sg != sd->groups); -	} -done:  	return target;  } @@ -5360,6 +5584,32 @@ static int cpu_util(int cpu)  	return (util >= capacity) ? capacity : util;  } +static inline int task_util(struct task_struct *p) +{ +	return p->se.avg.util_avg; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ +	long min_cap, max_cap; + +	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); +	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + +	/* Minimum capacity is close to max, no need to abort wake_affine */ +	if (max_cap - min_cap < max_cap >> 3) +		return 0; + +	return min_cap * 1024 < task_util(p) * capacity_margin; +} +  /*   * select_task_rq_fair: Select target runqueue for the waking task in domains   * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5383,7 +5633,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  	if (sd_flag & SD_BALANCE_WAKE) {  		record_wakee(p); -		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); +		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) +			      && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));  	}  	rcu_read_lock(); @@ -5409,13 +5660,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  	if (affine_sd) {  		sd = NULL; /* Prefer wake_affine over balance flags */ -		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) +		if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))  			new_cpu = cpu;  	}  	if (!sd) {  		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ -			new_cpu = select_idle_sibling(p, new_cpu); +			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);  	} else while (sd) {  		struct sched_group *group; @@ -5939,7 +6190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp   *   * The adjacency matrix of the resulting graph is given by:   * - *             log_2 n      + *             log_2 n   *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)   *             k = 0   * @@ -5985,7 +6236,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp   *   * [XXX write more on how we solve this.. _after_ merging pjt's patches that   *      rewrite all of this once again.] - */  + */  static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -6133,7 +6384,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {  		int cpu; -		schedstat_inc(p, se.statistics.nr_failed_migrations_affine); +		schedstat_inc(p->se.statistics.nr_failed_migrations_affine);  		env->flags |= LBF_SOME_PINNED; @@ -6164,7 +6415,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	env->flags &= ~LBF_ALL_PINNED;  	if (task_running(env->src_rq, p)) { -		schedstat_inc(p, se.statistics.nr_failed_migrations_running); +		schedstat_inc(p->se.statistics.nr_failed_migrations_running);  		return 0;  	} @@ -6181,13 +6432,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	if (tsk_cache_hot <= 0 ||  	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {  		if (tsk_cache_hot == 1) { -			schedstat_inc(env->sd, lb_hot_gained[env->idle]); -			schedstat_inc(p, se.statistics.nr_forced_migrations); +			schedstat_inc(env->sd->lb_hot_gained[env->idle]); +			schedstat_inc(p->se.statistics.nr_forced_migrations);  		}  		return 1;  	} -	schedstat_inc(p, se.statistics.nr_failed_migrations_hot); +	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);  	
return 0;  } @@ -6227,7 +6478,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)  		 * so we can safely collect stats here rather than  		 * inside detach_tasks().  		 */ -		schedstat_inc(env->sd, lb_gained[env->idle]); +		schedstat_inc(env->sd->lb_gained[env->idle]);  		return p;  	}  	return NULL; @@ -6319,7 +6570,7 @@ next:  	 * so we can safely collect detach_one_task() stats here rather  	 * than inside detach_one_task().  	 */ -	schedstat_add(env->sd, lb_gained[env->idle], detached); +	schedstat_add(env->sd->lb_gained[env->idle], detached);  	return detached;  } @@ -6647,7 +6898,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)  		/*  		 * !SD_OVERLAP domains can assume that child groups  		 * span the current group. -		 */  +		 */  		group = child->groups;  		do { @@ -7147,7 +7398,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;  		if (load_above_capacity > busiest->group_capacity) {  			load_above_capacity -= busiest->group_capacity; -			load_above_capacity *= NICE_0_LOAD; +			load_above_capacity *= scale_load_down(NICE_0_LOAD);  			load_above_capacity /= busiest->group_capacity;  		} else  			load_above_capacity = ~0UL; @@ -7354,9 +7605,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,   */  #define MAX_PINNED_INTERVAL	512 -/* Working cpumask for load_balance and load_balance_newidle. 
*/ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -  static int need_active_balance(struct lb_env *env)  {  	struct sched_domain *sd = env->sd; @@ -7460,7 +7708,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,  	cpumask_copy(cpus, cpu_active_mask); -	schedstat_inc(sd, lb_count[idle]); +	schedstat_inc(sd->lb_count[idle]);  redo:  	if (!should_we_balance(&env)) { @@ -7470,19 +7718,19 @@ redo:  	group = find_busiest_group(&env);  	if (!group) { -		schedstat_inc(sd, lb_nobusyg[idle]); +		schedstat_inc(sd->lb_nobusyg[idle]);  		goto out_balanced;  	}  	busiest = find_busiest_queue(&env, group);  	if (!busiest) { -		schedstat_inc(sd, lb_nobusyq[idle]); +		schedstat_inc(sd->lb_nobusyq[idle]);  		goto out_balanced;  	}  	BUG_ON(busiest == env.dst_rq); -	schedstat_add(sd, lb_imbalance[idle], env.imbalance); +	schedstat_add(sd->lb_imbalance[idle], env.imbalance);  	env.src_cpu = busiest->cpu;  	env.src_rq = busiest; @@ -7589,7 +7837,7 @@ more_balance:  	}  	if (!ld_moved) { -		schedstat_inc(sd, lb_failed[idle]); +		schedstat_inc(sd->lb_failed[idle]);  		/*  		 * Increment the failure counter only on periodic balance.  		 * We do not want newidle balance, which can be very @@ -7672,7 +7920,7 @@ out_all_pinned:  	 * we can't migrate them. Let the imbalance flag set so parent level  	 * can try to migrate them.  	 
*/ -	schedstat_inc(sd, lb_balanced[idle]); +	schedstat_inc(sd->lb_balanced[idle]);  	sd->nr_balance_failed = 0; @@ -7704,11 +7952,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)  }  static inline void -update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +update_next_balance(struct sched_domain *sd, unsigned long *next_balance)  {  	unsigned long interval, next; -	interval = get_sd_balance_interval(sd, cpu_busy); +	/* used by idle balance, so cpu_busy = 0 */ +	interval = get_sd_balance_interval(sd, 0);  	next = sd->last_balance + interval;  	if (time_after(*next_balance, next)) @@ -7738,7 +7987,7 @@ static int idle_balance(struct rq *this_rq)  		rcu_read_lock();  		sd = rcu_dereference_check_sched_domain(this_rq->sd);  		if (sd) -			update_next_balance(sd, 0, &next_balance); +			update_next_balance(sd, &next_balance);  		rcu_read_unlock();  		goto out; @@ -7756,7 +8005,7 @@ static int idle_balance(struct rq *this_rq)  			continue;  		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { -			update_next_balance(sd, 0, &next_balance); +			update_next_balance(sd, &next_balance);  			break;  		} @@ -7774,7 +8023,7 @@ static int idle_balance(struct rq *this_rq)  			curr_cost += domain_cost;  		} -		update_next_balance(sd, 0, &next_balance); +		update_next_balance(sd, &next_balance);  		/*  		 * Stop searching for tasks to pull if there are @@ -7864,15 +8113,15 @@ static int active_load_balance_cpu_stop(void *data)  			.idle		= CPU_IDLE,  		}; -		schedstat_inc(sd, alb_count); +		schedstat_inc(sd->alb_count);  		p = detach_one_task(&env);  		if (p) { -			schedstat_inc(sd, alb_pushed); +			schedstat_inc(sd->alb_pushed);  			/* Active balancing done, reset the failure counter. 
*/  			sd->nr_balance_failed = 0;  		} else { -			schedstat_inc(sd, alb_failed); +			schedstat_inc(sd->alb_failed);  		}  	}  	rcu_read_unlock(); @@ -7964,13 +8213,13 @@ static inline void set_cpu_sd_state_busy(void)  	int cpu = smp_processor_id();  	rcu_read_lock(); -	sd = rcu_dereference(per_cpu(sd_busy, cpu)); +	sd = rcu_dereference(per_cpu(sd_llc, cpu));  	if (!sd || !sd->nohz_idle)  		goto unlock;  	sd->nohz_idle = 0; -	atomic_inc(&sd->groups->sgc->nr_busy_cpus); +	atomic_inc(&sd->shared->nr_busy_cpus);  unlock:  	rcu_read_unlock();  } @@ -7981,13 +8230,13 @@ void set_cpu_sd_state_idle(void)  	int cpu = smp_processor_id();  	rcu_read_lock(); -	sd = rcu_dereference(per_cpu(sd_busy, cpu)); +	sd = rcu_dereference(per_cpu(sd_llc, cpu));  	if (!sd || sd->nohz_idle)  		goto unlock;  	sd->nohz_idle = 1; -	atomic_dec(&sd->groups->sgc->nr_busy_cpus); +	atomic_dec(&sd->shared->nr_busy_cpus);  unlock:  	rcu_read_unlock();  } @@ -8214,8 +8463,8 @@ end:  static inline bool nohz_kick_needed(struct rq *rq)  {  	unsigned long now = jiffies; +	struct sched_domain_shared *sds;  	struct sched_domain *sd; -	struct sched_group_capacity *sgc;  	int nr_busy, cpu = rq->cpu;  	bool kick = false; @@ -8243,11 +8492,13 @@ static inline bool nohz_kick_needed(struct rq *rq)  		return true;  	rcu_read_lock(); -	sd = rcu_dereference(per_cpu(sd_busy, cpu)); -	if (sd) { -		sgc = sd->groups->sgc; -		nr_busy = atomic_read(&sgc->nr_busy_cpus); - +	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); +	if (sds) { +		/* +		 * XXX: write a coherent comment on why we do this. +		 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com +		 */ +		nr_busy = atomic_read(&sds->nr_busy_cpus);  		if (nr_busy > 1) {  			kick = true;  			goto unlock; @@ -8283,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }   * run_rebalance_domains is triggered when needed from the scheduler tick.   * Also triggered for nohz idle balancing (with nohz_balancing_kick set).   
*/ -static void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(struct softirq_action *h)  {  	struct rq *this_rq = this_rq();  	enum cpu_idle_type idle = this_rq->idle_balance ? @@ -8441,7 +8692,6 @@ static void detach_task_cfs_rq(struct task_struct *p)  	struct sched_entity *se = &p->se;  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	u64 now = cfs_rq_clock_task(cfs_rq); -	int tg_update;  	if (!vruntime_normalized(p)) {  		/* @@ -8453,10 +8703,9 @@ static void detach_task_cfs_rq(struct task_struct *p)  	}  	/* Catch up with the cfs_rq and remove our load when we leave */ -	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); +	update_cfs_rq_load_avg(now, cfs_rq, false);  	detach_entity_load_avg(cfs_rq, se); -	if (tg_update) -		update_tg_load_avg(cfs_rq, false); +	update_tg_load_avg(cfs_rq, false);  }  static void attach_task_cfs_rq(struct task_struct *p) @@ -8464,7 +8713,6 @@ static void attach_task_cfs_rq(struct task_struct *p)  	struct sched_entity *se = &p->se;  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	u64 now = cfs_rq_clock_task(cfs_rq); -	int tg_update;  #ifdef CONFIG_FAIR_GROUP_SCHED  	/* @@ -8475,10 +8723,9 @@ static void attach_task_cfs_rq(struct task_struct *p)  #endif  	/* Synchronize task with its cfs_rq */ -	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); +	update_cfs_rq_load_avg(now, cfs_rq, false);  	attach_entity_load_avg(cfs_rq, se); -	if (tg_update) -		update_tg_load_avg(cfs_rq, false); +	update_tg_load_avg(cfs_rq, false);  	if (!vruntime_normalized(p))  		se->vruntime += cfs_rq->min_vruntime; @@ -8592,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  {  	struct sched_entity *se;  	struct cfs_rq *cfs_rq; -	struct rq *rq;  	int i;  	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8607,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  	init_cfs_bandwidth(tg_cfs_bandwidth(tg));  	
for_each_possible_cpu(i) { -		rq = cpu_rq(i); -  		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),  				      GFP_KERNEL, cpu_to_node(i));  		if (!cfs_rq) |