aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c474
1 files changed, 284 insertions, 190 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c95880e216f6..5c09ddf8c832 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
@@ -513,6 +514,7 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
+ struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -523,10 +525,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
+ if (leftmost) { /* non-empty tree */
+ struct sched_entity *se;
+ se = rb_entry(leftmost, struct sched_entity, run_node);
if (!curr)
vruntime = se->vruntime;
@@ -547,10 +548,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
- int leftmost = 1;
+ bool leftmost = true;
/*
* Find the right place in the rbtree:
@@ -566,36 +567,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- /*
- * Maintain a cache of leftmost tree entries (it is frequently
- * used):
- */
- if (leftmost)
- cfs_rq->rb_leftmost = &se->run_node;
-
rb_link_node(&se->run_node, parent, link);
- rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_insert_color_cached(&se->run_node,
+ &cfs_rq->tasks_timeline, leftmost);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&se->run_node);
- cfs_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *left = cfs_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
@@ -616,7 +604,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
if (!last)
return NULL;
@@ -806,7 +794,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
/*
* For !fair tasks do:
*
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
@@ -1071,6 +1059,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -1107,13 +1118,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
}
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+
+ return max(smin, period);
+}
+
static unsigned int task_scan_max(struct task_struct *p)
{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
return max(smin, smax);
}
@@ -1129,26 +1174,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
- int active_nodes;
-
- struct rcu_head rcu;
- unsigned long total_faults;
- unsigned long max_faults_cpu;
- /*
- * Faults_cpu is used to decide whether memory should move
- * towards the CPU. As a consequence, these stats are weighted
- * more by CPU use than by memory faults.
- */
- unsigned long *faults_cpu;
- unsigned long faults[0];
-};
-
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
@@ -1198,6 +1223,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1427,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
@@ -1409,7 +1458,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
struct rq *rq = cpu_rq(cpu);
ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1808,7 +1857,7 @@ static int task_numa_migrate(struct task_struct *p)
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
*/
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1941,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
- int ratio;
+ int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1971,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -2448,7 +2508,7 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2636,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
+ curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2646,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2659,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -2790,6 +2789,31 @@ static inline void update_cfs_shares(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (&rq->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq, 0);
+ }
+}
+
#ifdef CONFIG_SMP
/*
* Approximate:
@@ -2968,6 +2992,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time += delta << 10;
/*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+ * This means that weight will be 0 but not running for a sched_entity
+ * but also for a cfs_rq if the latter becomes idle. As an example,
+ * this happens during idle_balance() which calls
+ * update_blocked_averages()
+ */
+ if (!weight)
+ running = 0;
+
+ /*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
@@ -3276,29 +3312,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
- if (&this_rq()->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_of(cfs_rq), 0);
- }
-}
-
/*
* Unsigned subtract and clamp on underflow.
*
@@ -3320,7 +3333,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,7 +3346,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* call update_tg_load_avg() when this function returns true.
*/
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0;
@@ -3362,7 +3374,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (update_freq && (decayed || removed_util))
+ if (decayed || removed_util)
cfs_rq_util_change(cfs_rq);
return decayed || removed_load;
@@ -3390,7 +3402,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cpu, cfs_rq, se);
- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (decayed && (flags & UPDATE_TG))
@@ -3534,7 +3546,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
@@ -3544,7 +3556,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
- cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+ cfs_rq_util_change(cfs_rq_of(se));
}
static inline void
@@ -4875,7 +4887,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* passed.
*/
if (p->in_iowait)
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
@@ -5125,9 +5137,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
}
/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+ return cfs_rq_runnable_load_avg(&rq->cfs);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5184,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
/*
* bail if there's load or we're actually up-to-date.
*/
- if (weighted_cpuload(cpu_of(this_rq)))
+ if (weighted_cpuload(this_rq))
return;
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5205,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same.
*/
- this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+ this_rq->cpu_load[0] = weighted_cpuload(this_rq);
}
/*
@@ -5209,7 +5221,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick)
return;
- load = weighted_cpuload(cpu_of(this_rq));
+ load = weighted_cpuload(this_rq);
rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5247,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
*/
void cpu_load_update_active(struct rq *this_rq)
{
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long load = weighted_cpuload(this_rq);
if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5265,7 @@ void cpu_load_update_active(struct rq *this_rq)
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5268,7 +5280,7 @@ static unsigned long source_load(int cpu, int type)
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5290,7 +5302,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(cpu);
+ unsigned long load_avg = weighted_cpuload(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5345,20 +5357,77 @@ static int wake_wide(struct task_struct *p)
return 1;
}
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ * scheduling latency of the CPUs. This seems to work
+ * for the overloaded case.
+ */
+
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ if (idle_cpu(this_cpu))
+ return true;
+
+ if (sync && cpu_rq(this_cpu)->nr_running == 1)
+ return true;
+
+ return false;
+}
+
+static bool
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ this_eff_load = target_load(this_cpu, sd->wake_idx);
+ prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ if (current_load > this_eff_load)
+ return true;
+
+ this_eff_load -= current_load;
+ }
+
+ task_load = task_h_load(p);
+
+ this_eff_load += task_load;
+ if (sched_feat(WA_BIAS))
+ this_eff_load *= 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load -= task_load;
+ if (sched_feat(WA_BIAS))
+ prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
+
+ return this_eff_load <= prev_eff_load;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
bool affine = false;
- /*
- * Common case: CPUs are in the same socket, and select_idle_sibling()
- * will do its thing regardless of what we return:
- */
- if (cpus_share_cache(prev_cpu, this_cpu))
- affine = true;
- else
- affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_IDLE) && !affine)
+ affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+
+ if (sched_feat(WA_WEIGHT) && !affine)
+ affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -5550,7 +5619,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
+ load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
@@ -6187,10 +6256,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
goto idle;
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class)
goto simple;
@@ -6220,11 +6289,17 @@ again:
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
- * Therefore the 'simple' nr_running test will indeed
+ * Therefore the nr_running test will indeed
* be correct.
*/
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ cfs_rq = &rq->cfs;
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
goto simple;
+ }
}
se = pick_next_entity(cfs_rq, curr);
@@ -6264,12 +6339,8 @@ again:
return p;
simple:
- cfs_rq = &rq->cfs;
#endif
- if (!cfs_rq->nr_running)
- goto idle;
-
put_prev_task(rq, prev);
do {
@@ -6917,7 +6988,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -6990,7 +7061,7 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7036,6 +7107,7 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
@@ -7055,6 +7127,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
+ .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
@@ -7363,7 +7436,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(i);
+ sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -7546,6 +7619,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -7560,7 +7634,6 @@ next_group:
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
-
}
/**
@@ -7581,7 +7654,7 @@ next_group:
* number.
*
* Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in *imbalance.
+ * this CPU. The amount of the imbalance is returned in env->imbalance.
*
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
@@ -7790,6 +7863,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ /* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -7892,7 +7966,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(rq);
/*
* When comparing with imbalance, use weighted_cpuload()
@@ -7970,6 +8044,13 @@ static int should_we_balance(struct lb_env *env)
int cpu, balance_cpu = -1;
/*
+ * Ensure the balancing environment is consistent; can happen
+ * when the softirq triggers 'during' hotplug.
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+ return 0;
+
+ /*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
@@ -8309,6 +8390,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
this_rq->idle_stamp = rq_clock(this_rq);
/*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
@@ -8415,6 +8502,13 @@ static int active_load_balance_cpu_stop(void *data)
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
+ /*
+ * Between queueing the stop-work and running it is a hole in which
+ * CPUs can become inactive. We should not move tasks from or to
+ * inactive CPUs.
+ */
+ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+ goto out_unlock;
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -9171,7 +9265,7 @@ static void set_curr_task_fair(struct rq *rq)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
- cfs_rq->tasks_timeline = RB_ROOT;
+ cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;