diff options
Diffstat (limited to 'kernel/sched/fair.c')
| -rw-r--r-- | kernel/sched/fair.c | 1020 |
1 files changed, 522 insertions, 498 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6559d197e08a..c95880e216f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,7 +20,9 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/topology.h> + #include <linux/latencytop.h> #include <linux/cpumask.h> #include <linux/cpuidle.h> @@ -367,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) } /* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -461,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -715,18 +718,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP + +#include "sched-pelt.h" + static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are - * dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ - /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) { @@ -1385,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); -static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1551,7 +1547,7 @@ static void task_numa_compare(struct task_numa_env *env, */ if (cur) { /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) + if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) goto unlock; /* @@ -1661,7 +1657,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) + if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) continue; env->dst_cpu = cpu; @@ -2473,7 +2469,8 @@ void task_numa_work(struct callback_head *work) return; - down_read(&mm->mmap_sem); + if (!down_read_trylock(&mm->mmap_sem)) + return; vma = find_vma(mm, start); if (!vma) { reset_ptenuma_scan(p); @@ -2588,6 +2585,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } } + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + */ +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + struct numa_stats prev_load, this_load; + s64 this_eff_load, prev_eff_load; + + update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); + update_numa_stats(&this_load, cpu_to_node(this_cpu)); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) { + unsigned long current_load = task_h_load(current); + + if (this_load.load > current_load) + this_load.load -= current_load; + else + this_load.load = 0; + } + + /* + * In low-load situations, where this_cpu's node is idle due to the + * sync cause above having dropped this_load.load to 0, move the task. + * Moving to an idle socket will not create a bad imbalance. + * + * Otherwise check if the nodes are near enough in load to allow this + * task to be woken on this_cpu's node. + */ + if (this_load.load > 0) { + unsigned long task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_load.compute_capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_load.compute_capacity; + + this_eff_load *= this_load.load + task_load; + prev_eff_load *= prev_load.load - task_load; + + return this_eff_load <= prev_eff_load; + } + + return true; +} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2600,6 +2651,15 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } + +#ifdef CONFIG_SMP +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + return true; +} +#endif /* !SMP */ #endif /* CONFIG_NUMA_BALANCING */ static void @@ -2657,6 +2717,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) if (tg_weight) shares /= tg_weight; + /* + * MIN_SHARES has to be unscaled here to support per-CPU partitioning + * of a group with small tg->shares value. It is a floor value which is + * assigned as a minimum load.weight to the sched_entity representing + * the group on a CPU. + * + * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 + * on an 8-core system with 8 tasks each runnable on one CPU shares has + * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In + * case no task is runnable on a CPU MIN_SHARES=2 should be returned + * instead of 0. + */ if (shares < MIN_SHARES) shares = MIN_SHARES; if (shares > tg->shares) @@ -2689,16 +2761,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) + return; + + if (throttled_hierarchy(cfs_rq)) return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2707,54 +2783,23 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ -static const u32 runnable_avg_yN_inv[] = { - 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, - 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, - 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, - 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, - 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, - 0x85aac367, 0x82cd8698, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent - * over-estimates when re-combining. - */ -static const u32 runnable_avg_yN_sum[] = { - 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, - 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, - 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to - * lower integers. See Documentation/scheduler/sched-avg.txt how these - * were generated: - */ -static const u32 __accumulated_sum_N32[] = { - 0, 23371, 35056, 40899, 43820, 45281, - 46011, 46376, 46559, 46650, 46696, 46719, -}; - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static __always_inline u64 decay_load(u64 val, u64 n) +static u64 decay_load(u64 val, u64 n) { unsigned int local_n; - if (!n) - return val; - else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + if (unlikely(n > LOAD_AVG_PERIOD * 63)) return 0; /* after bounds checking we can collapse to 32-bit */ @@ -2776,30 +2821,97 @@ static __always_inline u64 decay_load(u64 val, u64 n) return val; } +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; /* y^0 == 1 */ + + /* + * c1 = d1 y^p + */ + c1 = decay_load((u64)d1, periods); + + /* + * p-1 + * c2 = 1024 \Sum y^n + * n=1 + * + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p + */ + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* - * For updates fully spanning n periods, the contribution to runnable - * average will be: \Sum 1024*y^n + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 * - * We can compute this reasonably efficiently by combining: - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + * = u y^p + (Step 1) + * + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 */ -static u32 __compute_runnable_contrib(u64 n) +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u32 contrib = 0; + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; - if (likely(n <= LOAD_AVG_PERIOD)) - return runnable_avg_yN_sum[n]; - else if (unlikely(n >= LOAD_AVG_MAX_N)) - return LOAD_AVG_MAX; + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; - n %= LOAD_AVG_PERIOD; - contrib = decay_load(contrib, n); - return contrib + runnable_avg_yN_sum[n]; -} + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} /* * We can represent the historical contribution to runnable average as the @@ -2830,13 +2942,10 @@ static u32 __compute_runnable_contrib(u64 n) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -__update_load_avg(u64 now, int cpu, struct sched_avg *sa, +___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2855,83 +2964,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, delta >>= 10; if (!delta) return 0; - sa->last_update_time = now; - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; + sa->last_update_time += delta << 10; - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; - - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. - */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); } + sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; + return 1; +} - sa->period_contrib += delta; +static int +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); +} - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - } +static int +__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); +} - return decayed; +static int +__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + return ___update_load_avg(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq); } /* @@ -2968,8 +3046,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * differential update where we store the last value we propagated. This in * turn allows skipping updates if the differential is 'small'. * - * Updating tg's load_avg is necessary before update_cfs_share() (which is - * done) and effective_load() (which is not done because it is too costly). + * Updating tg's load_avg is necessary before update_cfs_share(). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2995,6 +3072,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { + u64 p_last_update_time; + u64 n_last_update_time; + if (!sched_feat(ATTACH_AGE_LOAD)) return; @@ -3005,11 +3085,11 @@ void set_task_rq_fair(struct sched_entity *se, * time. This will result in the wakee task is less decayed, but giving * the wakee more load sounds not bad. */ - if (se->avg.last_update_time && prev) { - u64 p_last_update_time; - u64 n_last_update_time; + if (!(se->avg.last_update_time && prev)) + return; #ifndef CONFIG_64BIT + { u64 p_last_update_time_copy; u64 n_last_update_time_copy; @@ -3024,14 +3104,13 @@ void set_task_rq_fair(struct sched_entity *se, } while (p_last_update_time != p_last_update_time_copy || n_last_update_time != n_last_update_time_copy); + } #else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), - &se->avg, 0, 0, NULL); - se->avg.last_update_time = n_last_update_time; - } + __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + se->avg.last_update_time = n_last_update_time; } /* Take into account change of utilization of a child task group */ @@ -3154,6 +3233,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -3246,8 +3355,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) set_tg_cfs_propagate(cfs_rq); } - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); + decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3279,11 +3387,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { - __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed |= propagate_entity_load_avg(se); @@ -3388,7 +3493,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); } /* @@ -3424,7 +3529,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq); +static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ @@ -3453,7 +3558,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq) +static inline int idle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -3521,7 +3626,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_runtime_enabled()) { printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enabled or " + "kernel parameter schedstats=enable or " "kernel.sched_schedstats=1\n"); } #endif @@ -3582,10 +3687,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); @@ -3657,6 +3770,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Substract its load from the cfs_rq->runnable_avg. + * - Substract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); @@ -3681,7 +3803,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -3864,7 +3986,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(curr, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4235,8 +4357,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4253,7 +4376,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); if (!remaining) break; @@ -4582,24 +4705,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +/* + * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * + * The race is harmless, since modifying bandwidth settings of unhooked group + * bits doesn't do much. + */ + +/* cpu online calback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; raw_spin_unlock(&cfs_b->lock); } + rcu_read_unlock(); } +/* cpu offline callback */ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; + + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - for_each_leaf_cfs_rq(rq, cfs_rq) { if (!cfs_rq->runtime_enabled) continue; @@ -4617,6 +4759,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } + rcu_read_unlock(); } #else /* CONFIG_CFS_BANDWIDTH */ @@ -4761,7 +4904,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -4820,7 +4963,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -5061,15 +5204,16 @@ void cpu_load_update_nohz_stop(void) unsigned long curr_jiffies = READ_ONCE(jiffies); struct rq *this_rq = this_rq(); unsigned long load; + struct rq_flags rf; if (curr_jiffies == this_rq->last_load_update_tick) return; load = weighted_cpuload(cpu_of(this_rq)); - raw_spin_lock(&this_rq->lock); + rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); - raw_spin_unlock(&this_rq->lock); + rq_unlock(this_rq, &rf); } #else /* !CONFIG_NO_HZ_COMMON */ static inline void cpu_load_update_nohz(struct rq *this_rq, @@ -5154,126 +5298,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * effective_load() calculates the load change as seen from the root_task_group - * - * Adding load to a group doesn't make a group heavier, but can cause movement - * of group shares between cpus. Assuming the shares were perfectly aligned one - * can calculate the shift in shares. - * - * Calculate the effective load difference if @wl is added (subtracted) to @tg - * on this @cpu and results in a total addition (subtraction) of @wg to the - * total group weight. - * - * Given a runqueue weight distribution (rw_i) we can compute a shares - * distribution (s_i) using: - * - * s_i = rw_i / \Sum rw_j (1) - * - * Suppose we have 4 CPUs and our @tg is a direct child of the root group and - * has 7 equal weight tasks, distributed as below (rw_i), with the resulting - * shares distribution (s_i): - * - * rw_i = { 2, 4, 1, 0 } - * s_i = { 2/7, 4/7, 1/7, 0 } - * - * As per wake_affine() we're interested in the load of two CPUs (the CPU the - * task used to run on and the CPU the waker is running on), we need to - * compute the effect of waking a task on either CPU and, in case of a sync - * wakeup, compute the effect of the current task going to sleep. - * - * So for a change of @wl to the local @cpu with an overall group weight change - * of @wl we can compute the new shares distribution (s'_i) using: - * - * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) - * - * Suppose we're interested in CPUs 0 and 1, and want to compute the load - * differences in waking a task to CPU 0. The additional task changes the - * weight and shares distributions like: - * - * rw'_i = { 3, 4, 1, 0 } - * s'_i = { 3/8, 4/8, 1/8, 0 } - * - * We can then compute the difference in effective weight by using: - * - * dw_i = S * (s'_i - s_i) (3) - * - * Where 'S' is the group weight as seen by its parent. - * - * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) - * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - - * 4/7) times the weight of the group. - */ -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - struct sched_entity *se = tg->se[cpu]; - - if (!tg->parent) /* the trivial, non-cgroup case */ - return wl; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = se->my_q; - long W, w = cfs_rq_load_avg(cfs_rq); - - tg = cfs_rq->tg; - - /* - * W = @wg + \Sum rw_j - */ - W = wg + atomic_long_read(&tg->load_avg); - - /* Ensure \Sum rw_j >= rw_i */ - W -= cfs_rq->tg_load_avg_contrib; - W += w; - - /* - * w = rw_i + @wl - */ - w += wl; - - /* - * wl = S * s'_i; see (2) - */ - if (W > 0 && w < W) - wl = (w * (long)scale_load_down(tg->shares)) / W; - else - wl = scale_load_down(tg->shares); - - /* - * Per the above, wl is the new se->load.weight value; since - * those are clipped to [MIN_SHARES, ...) do so now. See - * calc_cfs_shares(). - */ - if (wl < MIN_SHARES) - wl = MIN_SHARES; - - /* - * wl = dw_i = S * (s'_i - s_i); see (3) - */ - wl -= se->avg.load_avg; - - /* - * Recursively apply this logic to all parent groups to compute - * the final effective load change on the root group. Since - * only the @tg group gets extra weight, all parent groups can - * only redistribute existing shares. @wl is the shift in shares - * resulting from this level per the above. - */ - wg = 0; - } - - return wl; -} -#else - -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - return wl; -} - -#endif - static void record_wakee(struct task_struct *p) { /* @@ -5324,67 +5348,25 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { - s64 this_load, load; - s64 this_eff_load, prev_eff_load; - int idx, this_cpu; - struct task_group *tg; - unsigned long weight; - int balanced; - - idx = sd->wake_idx; - this_cpu = smp_processor_id(); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + int this_cpu = smp_processor_id(); + bool affine = false; /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: + * Common case: CPUs are in the same socket, and select_idle_sibling() + * will do its thing regardless of what we return: */ - if (sync) { - tg = task_group(current); - weight = current->se.avg.load_avg; - - this_load += effective_load(tg, this_cpu, -weight, -weight); - load += effective_load(tg, prev_cpu, 0, -weight); - } - - tg = task_group(p); - weight = p->se.avg.load_avg; - - /* - * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped this_load to 0, we'll - * always have an imbalance, but there's really nothing you can do - * about that, so that's good too. - * - * Otherwise check if either cpus are near enough in load to allow this - * task to be woken on this_cpu. - */ - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); - - if (this_load > 0) { - this_eff_load *= this_load + - effective_load(tg, this_cpu, weight, weight); - - prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); - } - - balanced = this_eff_load <= prev_eff_load; + if (cpus_share_cache(prev_cpu, this_cpu)) + affine = true; + else + affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + if (affine) { + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + } - if (!balanced) - return 0; - - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - - return 1; + return affine; } static inline int task_util(struct task_struct *p); @@ -5423,12 +5405,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - tsk_cpus_allowed(p))) + if (!cpumask_intersects(sched_group_span(group), + &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + sched_group_span(group)); /* * Tally up the load of all CPUs in the group and find @@ -5438,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, runnable_load = 0; max_spare_cap = 0; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu(i, sched_group_span(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -5541,10 +5523,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Check if we have any choice: */ if (group->group_weight == 1) - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { + for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5579,43 +5561,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -/* - * Implement a for_each_cpu() variant that starts the scan at a given cpu - * (@start), and wraps around. - * - * This is used to scan for idle CPUs; such that not all CPUs looking for an - * idle CPU find the same CPU. The down-side is that tasks tend to cycle - * through the LLC domain. - * - * Especially tbench is found sensitive to this. - */ - -static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) -{ - int next; - -again: - next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); - - if (*wrapped) { - if (next >= start) - return nr_cpumask_bits; - } else { - if (next >= nr_cpumask_bits) { - *wrapped = 1; - n = -1; - goto again; - } - } - - return next; -} - -#define for_each_cpu_wrap(cpu, mask, start, wrap) \ - for ((wrap) = 0, (cpu) = (start)-1; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ - (cpu) < nr_cpumask_bits; ) - #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5675,7 +5620,7 @@ unlock: static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu, wrap; + int core, cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5683,9 +5628,9 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int if (!test_idle_cores(target, false)) return -1; - cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); + cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); - for_each_cpu_wrap(core, cpus, target, wrap) { + for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -5717,7 +5662,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) return cpu; @@ -5748,28 +5693,39 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct sched_domain *this_sd; - u64 avg_cost, avg_idle = this_rq()->avg_idle; + u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu, wrap; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - avg_cost = this_sd->avg_scan_cost; - /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. */ - if ((avg_idle / 512) < avg_cost) + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; + + if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) return -1; + if (sched_feat(SIS_PROP)) { + u64 span_avg = sd->span_weight * avg_idle; + if (span_avg > 4*avg_cost) + nr = div_u64(span_avg, avg_cost); + else + nr = 4; + } + time = local_clock(); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + if (!--nr) + return -1; + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) break; @@ -5924,7 +5880,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + && cpumask_test_cpu(cpu, &p->cpus_allowed); } rcu_read_lock(); @@ -5950,11 +5906,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) + if (cpu == prev_cpu) + goto pick_cpu; + + if (wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { + pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); @@ -6107,8 +6067,11 @@ static void set_last_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->last = se; + } } static void set_next_buddy(struct sched_entity *se) @@ -6116,8 +6079,11 @@ static void set_next_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->next = se; + } } static void set_skip_buddy(struct sched_entity *se) @@ -6213,7 +6179,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -6320,15 +6286,8 @@ simple: return p; idle: - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. - */ - lockdep_unpin_lock(&rq->lock, cookie); - new_tasks = idle_balance(rq); - lockdep_repin_lock(&rq->lock, cookie); + new_tasks = idle_balance(rq, rf); + /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -6632,6 +6591,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (dst_nid == p->numa_preferred_nid) return 0; + /* Leaving a core idle is often worse than degrading locality. */ + if (env->idle != CPU_NOT_IDLE) + return -1; + if (numa_group) { src_faults = group_faults(p, src_nid); dst_faults = group_faults(p, dst_nid); @@ -6671,7 +6634,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -6683,15 +6646,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Also avoid computing new_dst_cpu if we have already computed - * one in current iteration. + * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have + * already computed one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -6740,7 +6703,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - deactivate_task(env->src_rq, p, 0); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -6873,7 +6836,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6884,9 +6847,12 @@ static void attach_task(struct rq *rq, struct task_struct *p) */ static void attach_one_task(struct rq *rq, struct task_struct *p) { - raw_spin_lock(&rq->lock); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); attach_task(rq, p); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -6897,8 +6863,10 @@ static void attach_tasks(struct lb_env *env) { struct list_head *tasks = &env->tasks; struct task_struct *p; + struct rq_flags rf; - raw_spin_lock(&env->dst_rq->lock); + rq_lock(env->dst_rq, &rf); + update_rq_clock(env->dst_rq); while (!list_empty(tasks)) { p = list_first_entry(tasks, struct task_struct, se.group_node); @@ -6907,24 +6875,44 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - raw_spin_unlock(&env->dst_rq->lock); + rq_unlock(env->dst_rq, &rf); } #ifdef CONFIG_FAIR_GROUP_SCHED + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; - unsigned long flags; + struct cfs_rq *cfs_rq, *pos; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -6932,11 +6920,19 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); + + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. + */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } /* @@ -6990,12 +6986,12 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } static unsigned long task_h_load(struct task_struct *p) @@ -7167,7 +7163,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) { + for_each_cpu(cpu, sched_group_span(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); @@ -7225,7 +7221,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) /* * Group imbalance indicates (and tries to solve) the problem where balancing - * groups is inadequate due to tsk_cpus_allowed() constraints. + * groups is inadequate due to ->cpus_allowed constraints. * * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a * cpumask covering 1 cpu of the first group and 3 cpus of the second group. @@ -7346,7 +7342,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); /* Bias balancing toward cpus of our domain */ @@ -7496,6 +7492,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false; @@ -7509,10 +7506,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *sgs = &tmp_sgs; int local_group; - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); if (local_group) { sds->local = sg; - sgs = &sds->local_stat; + sgs = local; if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) @@ -7536,8 +7533,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * the tasks on the system). */ if (prefer_sibling && sds->local && - group_has_capacity(env, &sds->local_stat) && - (sgs->sum_nr_running > 1)) { + group_has_capacity(env, local) && + (sgs->sum_nr_running > local->sum_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } @@ -7568,7 +7565,7 @@ next_group: /** * check_asym_packing - Check to see if the group is packed into the - * sched doman. + * sched domain. * * This is primarily intended to used at the sibling level. Some * cores like POWER7 prefer to use lower numbered SMT threads. In the @@ -7864,7 +7861,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -7970,7 +7967,6 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_cpus, *sg_mask; int cpu, balance_cpu = -1; /* @@ -7980,11 +7976,9 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_cpus = sched_group_cpus(sg); - sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + if (!idle_cpu(cpu)) continue; balance_cpu = cpu; @@ -8013,14 +8007,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; - unsigned long flags; + struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), + .dst_grpmask = sched_group_span(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, @@ -8028,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .tasks = LIST_HEAD_INIT(env.tasks), }; - /* - * For NEWLY_IDLE load_balancing, we don't need to consider - * other cpus in our group - */ - if (idle == CPU_NEWLY_IDLE) - env.dst_grpmask = NULL; - - cpumask_copy(cpus, cpu_active_mask); + cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); schedstat_inc(sd->lb_count[idle]); @@ -8076,7 +8063,8 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - raw_spin_lock_irqsave(&busiest->lock, flags); + rq_lock_irqsave(busiest, &rf); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -8092,14 +8080,14 @@ more_balance: * See task_rq_lock() family for the details. */ - raw_spin_unlock(&busiest->lock); + rq_unlock(busiest, &rf); if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } - local_irq_restore(flags); + local_irq_restore(rf.flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -8156,7 +8144,15 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * Attempting to continue load balancing at the current + * sched_domain level only makes sense if there are + * active CPUs remaining as possible busiest CPUs to + * pull load from which are not contained within the + * destination group that is receiving any migrated + * load. + */ + if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; @@ -8177,14 +8173,15 @@ more_balance: sd->nr_balance_failed++; if (need_active_balance(&env)) { + unsigned long flags; + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, * if the curr task on busiest cpu can't be * moved to this_cpu */ - if (!cpumask_test_cpu(this_cpu, - tsk_cpus_allowed(busiest->curr))) { + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; @@ -8297,7 +8294,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static int idle_balance(struct rq *this_rq) +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -8311,6 +8308,14 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { rcu_read_lock(); @@ -8388,6 +8393,8 @@ out: if (pulled_task) this_rq->idle_stamp = 0; + rq_repin_lock(this_rq, rf); + return pulled_task; } @@ -8405,8 +8412,9 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; struct task_struct *p = NULL; + struct rq_flags rf; - raw_spin_lock_irq(&busiest_rq->lock); + rq_lock_irq(busiest_rq, &rf); /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -8440,9 +8448,17 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, + /* + * can_migrate_task() doesn't need to compute new_dst_cpu + * for active balancing. Since we have CPU_IDLE, but no + * @dst_grpmask we need to make that test go away with lying + * about DST_PINNED. + */ + .flags = LBF_DST_PINNED, }; schedstat_inc(sd->alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { @@ -8456,7 +8472,7 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock(&busiest_rq->lock); + rq_unlock(busiest_rq, &rf); if (p) attach_one_task(target_rq, p); @@ -8582,6 +8598,10 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ + if (!is_housekeeping_cpu(cpu)) + return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; @@ -8754,10 +8774,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { - raw_spin_lock_irq(&rq->lock); + struct rq_flags rf; + + rq_lock_irq(rq, &rf); update_rq_clock(rq); cpu_load_update_idle(rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); + rebalance_domains(rq, CPU_IDLE); } @@ -8948,8 +8971,9 @@ static void task_fork_fair(struct task_struct *p) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; struct rq *rq = this_rq(); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); @@ -8970,7 +8994,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -9264,6 +9288,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); @@ -9331,7 +9356,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -9348,17 +9372,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se; + struct sched_entity *se = tg->se[i]; + struct rq_flags rf; - se = tg->se[i]; /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - - /* Possible calls to update_curr() need rq clock */ + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); - raw_spin_unlock_irqrestore(&rq->lock, flags); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } + rq_unlock_irqrestore(rq, &rf); } done: @@ -9442,10 +9466,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } |