aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c205
1 files changed, 97 insertions, 108 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e476f6d9435..5146163bfabb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1502,7 +1502,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight);
@@ -1569,7 +1568,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
- ns->util += cpu_util(cpu);
+ ns->util += cpu_util_cfs(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
@@ -3029,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3240,7 +3241,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
- * See cpu_util().
+ * See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
@@ -3382,7 +3383,6 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
-
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3450,15 +3450,14 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3467,23 +3466,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * divider;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3494,19 +3500,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * divider;
+ new_sum = se->avg.runnable_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+ se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
+ s64 delta_sum;
u32 divider;
if (!runnable_sum)
@@ -3533,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
- load_sum = div_s64(gcfs_rq->avg.load_sum,
+ load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
@@ -3550,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
- load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, divider);
+ load_sum = se_weight(se) * runnable_sum;
+ load_avg = div_u64(load_sum, divider);
- se->avg.load_sum = runnable_sum;
-
- delta = load_avg - se->avg.load_avg;
- if (!delta)
+ delta_avg = load_avg - se->avg.load_avg;
+ if (!delta_avg)
return;
- se->avg.load_avg = load_avg;
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3653,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed load.
+ * Return: true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
@@ -3678,15 +3693,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
- sa->load_sum = sa->load_avg * divider;
+ sub_positive(&sa->load_sum, r * divider);
+ /* See sa->util_sum below */
+ sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
r = removed_util;
sub_positive(&sa->util_avg, r);
- sa->util_sum = sa->util_avg * divider;
+ sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
- sa->runnable_sum = sa->runnable_avg * divider;
+ sub_positive(&sa->runnable_sum, r * divider);
+ /* See sa->util_sum above */
+ sa->runnable_sum = max_t(u32, sa->runnable_sum,
+ sa->runnable_avg * PELT_MIN_DIVIDER);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3773,17 +3805,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = get_pelt_divider(&cfs_rq->avg);
-
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -4070,7 +4103,8 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
+static inline int task_fits_capacity(struct task_struct *p,
+ unsigned long capacity)
{
return fits_capacity(uclamp_task_util(p), capacity);
}
@@ -5509,11 +5543,9 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
-static inline unsigned long cpu_util(int cpu);
-
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
+ return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
@@ -6345,7 +6377,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
-static inline bool asym_fits_capacity(int task_util, int cpu)
+static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
{
if (static_branch_unlikely(&sched_asym_cpucapacity))
return fits_capacity(task_util, capacity_of(cpu));
@@ -6398,8 +6430,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
+ in_task() &&
prev == smp_processor_id() &&
- this_rq()->nr_running <= 1) {
+ this_rq()->nr_running <= 1 &&
+ asym_fits_capacity(task_util, prev)) {
return prev;
}
@@ -6456,58 +6490,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/**
- * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
- * @cpu: the CPU to get the utilization of
- *
- * The unit of the return value must be the one of capacity so we can compare
- * the utilization with the capacity of the CPU that is available for CFS task
- * (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * The estimated utilization of a CPU is defined to be the maximum between its
- * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
- * currently RUNNABLE on that CPU.
- * This allows to properly represent the expected utilization of a CPU which
- * has just got a big task running since a long sleep period. At the same time
- * however it preserves the benefits of the "blocked utilization" in
- * describing the potential for other tasks waking up on the same CPU.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- *
- * Return: the (estimated) utilization for the specified CPU
- */
-static inline unsigned long cpu_util(int cpu)
-{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
-
- return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -6528,7 +6510,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util(cpu);
+ return cpu_util_cfs(cpu);
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
@@ -6592,7 +6574,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
- * the cpu_util call.
+ * cpu_util.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
@@ -6624,7 +6606,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
- * cpu_util() after the task has been enqueued.
+ * cpu_util after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
@@ -6915,6 +6897,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}
+ /*
+ * Usually only true for WF_EXEC and WF_FORK, as sched_domains
+ * usually do not have SD_BALANCE_WAKE set. That means wakeup
+ * will usually go to the fast path.
+ */
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
@@ -8586,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct,
*
* If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
* of @dst_cpu are idle and @sg has lower priority.
+ *
+ * Return: true if @dst_cpu can pull tasks, false otherwise.
*/
static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sg_lb_stats *sgs,
@@ -8661,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_status: Holds flag indicating the status of the sched_group
@@ -8681,7 +8671,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct rq *rq = cpu_rq(i);
sgs->group_load += cpu_load(rq);
- sgs->group_util += cpu_util(i);
+ sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -9468,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
+ * @env: The load balancing environment.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
- * @env: The load balancing environment.
- *
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
@@ -9699,7 +9688,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util(cpu_of(rq));
+ util = cpu_util_cfs(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -11068,7 +11057,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}