diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 514 |
1 files changed, 287 insertions, 227 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa4c6227cd6d..8a8bd7b13634 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -531,12 +531,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline int entity_before(struct sched_entity *a, +static inline bool entity_before(struct sched_entity *a, struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } +#define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; @@ -552,8 +555,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); + struct sched_entity *se = __node_2_se(leftmost); if (!curr) vruntime = se->vruntime; @@ -569,37 +571,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) #endif } +static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +{ + return entity_before(__node_2_se(a), __node_2_se(b)); +} + /* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); + rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -614,7 +596,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) if (!left) return NULL; - return rb_entry(left, struct sched_entity, run_node); + return __node_2_se(left); } static struct sched_entity *__pick_next_entity(struct sched_entity *se) @@ -624,7 +606,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) if (!next) return NULL; - return rb_entry(next, struct sched_entity, run_node); + return __node_2_se(next); } #ifdef CONFIG_SCHED_DEBUG @@ -635,7 +617,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) if (!last) return NULL; - return rb_entry(last, struct sched_entity, run_node); + return __node_2_se(last); } /************************************************************** @@ -906,6 +888,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!schedstat_enabled()) return; + /* + * When the sched_schedstat changes from 0 to 1, some sched se + * maybe already in the runqueue, the se->statistics.wait_start + * will be 0.So it will let the delta wrong. We need to avoid this + * scenario. + */ + if (unlikely(!schedstat_val(se->statistics.wait_start))) + return; + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { @@ -1550,7 +1541,8 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int nr_running); +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int dst_weight); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1930,7 +1922,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, dst_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running, + env->dst_stats.weight); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -2928,7 +2921,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); + task_work_add(curr, work, TWA_RESUME); } } @@ -3932,6 +3925,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, trace_sched_util_est_cfs_tp(cfs_rq); } +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + /* * Check if a (signed) value is within a specified (unsigned) margin, * based on the observation that: @@ -3945,23 +3954,16 @@ static inline bool within_margin(int value, int margin) return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); } -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) { long last_ewma_diff; struct util_est ue; - int cpu; if (!sched_feat(UTIL_EST)) return; - /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); - - trace_sched_util_est_cfs_tp(cfs_rq); - /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4001,8 +4003,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) return; /* @@ -4041,7 +4042,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) if (!static_branch_unlikely(&sched_asym_cpucapacity)) return; - if (!p) { + if (!p || p->nr_cpus_allowed == 1) { rq->misfit_task_load = 0; return; } @@ -4085,8 +4086,11 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -4779,25 +4783,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; - if (dequeue) { - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - } else { - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - } + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } } - if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); +done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. @@ -5105,9 +5121,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; distribute_cfs_runtime(cfs_b); - - raw_spin_lock_irqsave(&cfs_b->lock, flags); - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* @@ -5399,7 +5412,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); return; } @@ -5416,7 +5429,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -5477,6 +5490,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); /* * The code below (indirectly) updates schedutil which looks at @@ -5549,7 +5563,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * into account, but that is not straightforward to implement, * and the following generally works well enough in practice. */ - if (flags & ENQUEUE_WAKEUP) + if (!task_new) update_overutilized_status(rq); enqueue_throttle: @@ -5588,6 +5602,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + util_est_dequeue(&rq->cfs, p); + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -5638,7 +5654,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) rq->next_balance = jiffies; dequeue_throttle: - util_est_dequeue(&rq->cfs, p, task_sleep); + util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5804,6 +5820,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + return nr_cpumask_bits; } @@ -5982,6 +6001,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu) +{ + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -6040,73 +6067,51 @@ unlock: * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; + bool idle = true; + int cpu; if (!static_branch_likely(&sched_smt_present)) - return -1; - - if (!test_idle_cores(target, false)) - return -1; + return __select_idle_cpu(core); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; - - for_each_cpu(cpu, cpu_smt_mask(core)) { - if (!available_idle_cpu(cpu)) { - idle = false; - break; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + *idle_cpu = cpu; + break; + } + continue; } + break; } - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); - - if (idle) - return core; + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } +#else /* CONFIG_SCHED_SMT */ - return -1; +static inline void set_idle_cores(int cpu, int val) +{ } -#else /* CONFIG_SCHED_SMT */ - -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline bool test_idle_cores(int cpu, bool def) { - return -1; + return def; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - return -1; + return __select_idle_cpu(core); } #endif /* CONFIG_SCHED_SMT */ @@ -6119,49 +6124,61 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + bool smt = test_idle_cores(target, false); + int this = smp_processor_id(); struct sched_domain *this_sd; - u64 avg_cost, avg_idle; u64 time; - int this = smp_processor_id(); - int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + if (sched_feat(SIS_PROP) && !smt) { + u64 avg_cost, avg_idle, span_avg; + + /* + * Due to large variance we need a large fuzz factor; + * hackbench in particularly is sensitive here. + */ + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; + span_avg = sd->span_weight * avg_idle; if (span_avg > 4*avg_cost) nr = div_u64(span_avg, avg_cost); else nr = 4; - } - - time = cpu_clock(this); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + time = cpu_clock(this); + } for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + + } else { + if (!--nr) + return -1; + idle_cpu = __select_idle_cpu(cpu); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } } - time = cpu_clock(this) - time; - update_avg(&this_sd->avg_scan_cost, time); + if (smt) + set_idle_cores(this, false); + + if (sched_feat(SIS_PROP) && !smt) { + time = cpu_clock(this) - time; + update_avg(&this_sd->avg_scan_cost, time); + } - return cpu; + return idle_cpu; } /* @@ -6172,21 +6189,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long best_cap = 0; + unsigned long task_util, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; - sync_entity_load_avg(&p->se); - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + task_util = uclamp_task_util(p); + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (task_fits_capacity(p, cpu_cap)) + if (fits_capacity(task_util, cpu_cap)) return cpu; if (cpu_cap > best_cap) { @@ -6198,44 +6215,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } +static inline bool asym_fits_capacity(int task_util, int cpu) +{ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + return fits_capacity(task_util, capacity_of(cpu)); + + return true; +} + /* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; + unsigned long task_util; int i, recent_used_cpu; /* - * For asymmetric CPU capacity systems, our domain of interest is - * sd_asym_cpucapacity rather than sd_llc. + * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. */ if (static_branch_unlikely(&sched_asym_cpucapacity)) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); - /* - * On an asymmetric CPU capacity system where an exclusive - * cpuset defines a symmetric island (i.e. one unique - * capacity_orig value through the cpuset), the key will be set - * but the CPUs within that cpuset will not have a domain with - * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric - * capacity path. - */ - if (!sd) - goto symmetric; - - i = select_idle_capacity(p, sd, target); - return ((unsigned)i < nr_cpumask_bits) ? i : target; + sync_entity_load_avg(&p->se); + task_util = uclamp_task_util(p); } -symmetric: - if (available_idle_cpu(target) || sched_idle_cpu(target)) + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_capacity(task_util, prev)) return prev; /* @@ -6258,7 +6273,8 @@ symmetric: recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_capacity(task_util, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6267,27 +6283,39 @@ symmetric: return recent_used_cpu; } + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - i = select_idle_cpu(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - return target; } /** - * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks + * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. * @cpu: the CPU to get the utilization of * * The unit of the return value must be the one of capacity so we can compare @@ -6499,7 +6527,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap, ENERGY_UTIL, NULL); /* @@ -6509,7 +6537,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap, FREQUENCY_UTIL, tsk); max_util = max(max_util, cpu_util); } @@ -6607,7 +6635,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). + * aligned with sched_cpu_util(). */ util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) @@ -6663,7 +6691,7 @@ fail: /* * select_task_rq_fair: Select target runqueue for the waking task in domains - * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * * Balances load by selecting the idlest CPU in the idlest group, or under @@ -6674,15 +6702,17 @@ fail: * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; - int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + /* SD_flags and WF_flags share the first nibble */ + int sd_flag = wake_flags & 0xF; - if (sd_flag & SD_BALANCE_WAKE) { + if (wake_flags & WF_TTWU) { record_wakee(p); if (sched_energy_enabled()) { @@ -6719,9 +6749,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (unlikely(sd)) { /* Slow path */ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); if (want_affine) @@ -7087,7 +7116,7 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif - if (hrtick_enabled(rq)) + if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); update_misfit_status(p, rq); @@ -8738,6 +8767,16 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* + * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. + * This is an approximation as the number of running tasks may not be + * related to the number of busy CPUs due to sched_setaffinity. + */ +static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +{ + return (dst_running < (dst_weight >> 2)); +} + +/* * find_idlest_group() finds and returns the least busy CPU group within the * domain. * @@ -8755,9 +8794,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) .group_type = group_overloaded, }; - imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - do { int local_group; @@ -8811,6 +8847,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) switch (local_sgs.group_type) { case group_overloaded: case group_fully_busy: + + /* Calculate allowed imbalance based on load */ + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + /* * When comparing groups across NUMA domains, it's possible for * the local domain to be very lightly loaded relative to the @@ -8867,7 +8908,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * a real need of migration, periodic load balance will * take care of it. */ - if (local_sgs.idle_cpus) + if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) return NULL; } @@ -8969,16 +9010,19 @@ next_group: } } -static inline long adjust_numa_imbalance(int imbalance, int nr_running) +#define NUMA_IMBALANCE_MIN 2 + +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int dst_weight) { - unsigned int imbalance_min; + if (!allow_numa_imbalance(dst_running, dst_weight)) + return imbalance; /* * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the source domain is almost idle. + * tasks that remain local when the destination is lightly loaded. */ - imbalance_min = 2; - if (nr_running <= imbalance_min) + if (imbalance <= NUMA_IMBALANCE_MIN) return 0; return imbalance; @@ -9031,7 +9075,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest. */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity @@ -9080,9 +9125,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* Consider allowing a small imbalance between NUMA groups */ - if (env->sd->flags & SD_NUMA) + if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running); + busiest->sum_nr_running, busiest->group_weight); + } return; } @@ -9327,8 +9373,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - capacity = capacity_of(i); nr_running = rq->cfs.h_nr_running; + if (!nr_running) + continue; + + capacity = capacity_of(i); /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -9434,13 +9483,32 @@ asym_active_balance(struct lb_env *env) } static inline bool -voluntary_active_balance(struct lb_env *env) +imbalanced_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + /* + * The imbalanced case includes the case of pinned tasks preventing a fair + * distribution of the load on the system but also the even distribution of the + * threads on a system with spare capacity + */ + if ((env->migration_type == migrate_task) && + (sd->nr_balance_failed > sd->cache_nice_tries+2)) + return 1; + + return 0; +} + +static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; if (asym_active_balance(env)) return 1; + if (imbalanced_active_balance(env)) + return 1; + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. * It's worth migrating the task if the src_cpu's capacity is reduced @@ -9460,16 +9528,6 @@ voluntary_active_balance(struct lb_env *env) return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9561,6 +9619,8 @@ redo: env.src_rq = busiest; ld_moved = 0; + /* Clear this flag as soon as we find a pullable task */ + env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -9568,7 +9628,6 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -9694,10 +9753,12 @@ more_balance: if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } + /* Record that we found at least one task that could run on this_cpu */ + env.flags &= ~LBF_ALL_PINNED; + /* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared @@ -9719,21 +9780,13 @@ more_balance: /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } - } else + } else { sd->nr_balance_failed = 0; + } - if (likely(!active_balance) || voluntary_active_balance(&env)) { + if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; } goto out; @@ -10047,6 +10100,10 @@ static inline int find_new_ilb(void) for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { + + if (ilb == smp_processor_id()) + continue; + if (idle_cpu(ilb)) return ilb; } @@ -10484,7 +10541,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * idle_balance is called by schedule() if this_cpu is about to become + * newidle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -10634,8 +10691,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; if (time_after_eq(jiffies, rq->next_balance)) @@ -10740,7 +10800,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { + if (task_current(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -10873,7 +10933,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); else check_preempt_curr(rq, p, 0); @@ -11158,8 +11218,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class - __attribute__((section("__fair_sched_class"))) = { +DEFINE_SCHED_CLASS(fair) = { + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, |