aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/auto_group.c4
-rw-r--r--kernel/sched/core.c37
-rw-r--r--kernel/sched/cpuacct.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c119
-rw-r--r--kernel/sched/cputime.c124
-rw-r--r--kernel/sched/deadline.c4
-rw-r--r--kernel/sched/fair.c665
-rw-r--r--kernel/sched/idle.c175
-rw-r--r--kernel/sched/sched.h11
9 files changed, 805 insertions, 336 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index f1c8fd566246..da39489d2d80 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -212,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
{
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
+ unsigned long shares;
int err;
if (nice < MIN_NICE || nice > MAX_NICE)
@@ -230,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
next = HZ / 10 + jiffies;
ag = autogroup_task_get(p);
+ shares = scale_load(sched_prio_to_weight[nice + 20]);
down_write(&ag->lock);
- err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
+ err = sched_group_set_shares(ag->tg, shares);
if (!err)
ag->nice = nice;
up_write(&ag->lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 154fd689fe02..966556ebdbb3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -75,11 +75,11 @@
#include <linux/compiler.h>
#include <linux/frame.h>
#include <linux/prefetch.h>
+#include <linux/mutex.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
-#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
@@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
*
- * Return: %true if @p was woken up, %false if it was already running.
- * or @state didn't match @p's state.
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * Atomic against schedule() which would dequeue a task, also see
+ * set_current_state().
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ * %false otherwise.
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
@@ -5279,6 +5280,7 @@ void init_idle(struct task_struct *idle, int cpu)
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ idle->flags |= PF_IDLE;
kasan_unpoison_task_stack(idle);
@@ -5707,7 +5709,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_CONT " %*pbl",
cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %d)",
+ printk(KERN_CONT " (cpu_capacity = %lu)",
group->sgc->capacity);
}
@@ -6184,6 +6186,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -6301,7 +6304,22 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
WARN_ON(!sg);
do {
+ int cpu, max_cpu = -1;
+
sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ goto next;
+
+ for_each_cpu(cpu, sched_group_cpus(sg)) {
+ if (max_cpu < 0)
+ max_cpu = cpu;
+ else if (sched_asym_prefer(cpu, max_cpu))
+ max_cpu = cpu;
+ }
+ sg->asym_prefer_cpu = max_cpu;
+
+next:
sg = sg->next;
} while (sg != sd->groups);
@@ -7602,6 +7620,7 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
* How much cpu bandwidth does root_task_group get?
*
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index bc0b309c3f19..9add206b5608 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n",
cpuacct_stat_desc[stat],
- cputime64_to_clock_t(val[stat]));
+ (long long)cputime64_to_clock_t(val[stat]));
}
return 0;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 69e06898997d..fd4659313640 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,11 +12,14 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpufreq.h>
+#include <linux/kthread.h>
#include <linux/slab.h>
#include <trace/events/power.h>
#include "sched.h"
+#define SUGOV_KTHREAD_PRIORITY 50
+
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
@@ -35,8 +38,10 @@ struct sugov_policy {
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
- struct work_struct work;
+ struct kthread_work work;
struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
bool work_in_progress;
bool need_freq_update;
@@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
raw_spin_unlock(&sg_policy->update_lock);
}
-static void sugov_work(struct work_struct *work)
+static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
@@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
struct sugov_policy *sg_policy;
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- schedule_work_on(smp_processor_id(), &sg_policy->work);
+
+ /*
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
+ *
+ * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
+ *
+ * There is a very rare case though, where the RT thread yields right
+ * after the work_in_progress flag is cleared. The effects of that are
+ * neglected for now.
+ */
+ kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
/************************** sysfs interface ************************/
@@ -371,19 +390,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
return NULL;
sg_policy->policy = policy;
- init_irq_work(&sg_policy->irq_work, sugov_irq_work);
- INIT_WORK(&sg_policy->work, sugov_work);
- mutex_init(&sg_policy->work_lock);
raw_spin_lock_init(&sg_policy->update_lock);
return sg_policy;
}
static void sugov_policy_free(struct sugov_policy *sg_policy)
{
- mutex_destroy(&sg_policy->work_lock);
kfree(sg_policy);
}
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ kthread_init_work(&sg_policy->work, sugov_work);
+ kthread_init_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ if (ret) {
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
+ kthread_bind_mask(thread, policy->related_cpus);
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
+ wake_up_process(thread);
+
+ return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ kthread_flush_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
+}
+
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
struct sugov_tunables *tunables;
@@ -416,16 +480,24 @@ static int sugov_init(struct cpufreq_policy *policy)
if (policy->governor_data)
return -EBUSY;
+ cpufreq_enable_fast_switch(policy);
+
sg_policy = sugov_policy_alloc(policy);
- if (!sg_policy)
- return -ENOMEM;
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
mutex_lock(&global_tunables_lock);
if (global_tunables) {
if (WARN_ON(have_governor_per_policy())) {
ret = -EINVAL;
- goto free_sg_policy;
+ goto stop_kthread;
}
policy->governor_data = sg_policy;
sg_policy->tunables = global_tunables;
@@ -437,7 +509,7 @@ static int sugov_init(struct cpufreq_policy *policy)
tunables = sugov_tunables_alloc(sg_policy);
if (!tunables) {
ret = -ENOMEM;
- goto free_sg_policy;
+ goto stop_kthread;
}
tunables->rate_limit_us = LATENCY_MULTIPLIER;
@@ -454,20 +526,25 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
- out:
+out:
mutex_unlock(&global_tunables_lock);
-
- cpufreq_enable_fast_switch(policy);
return 0;
- fail:
+fail:
policy->governor_data = NULL;
sugov_tunables_free(tunables);
- free_sg_policy:
+stop_kthread:
+ sugov_kthread_stop(sg_policy);
+
+free_sg_policy:
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
pr_err("initialization failed (error %d)\n", ret);
return ret;
}
@@ -478,8 +555,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
struct sugov_tunables *tunables = sg_policy->tunables;
unsigned int count;
- cpufreq_disable_fast_switch(policy);
-
mutex_lock(&global_tunables_lock);
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -489,7 +564,9 @@ static void sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
+ sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
+ cpufreq_disable_fast_switch(policy);
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -535,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
synchronize_sched();
- irq_work_sync(&sg_policy->irq_work);
- cancel_work_sync(&sg_policy->work);
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
}
static void sugov_limits(struct cpufreq_policy *policy)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5ebee3164e64..7700a9cba335 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -128,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index,
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
*/
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+void account_user_time(struct task_struct *p, cputime_t cputime)
{
int index;
/* Add user time to process. */
p->utime += cputime;
- p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
@@ -153,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
* Account guest cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
*/
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+static void account_guest_time(struct task_struct *p, cputime_t cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
/* Add guest time to process. */
p->utime += cputime;
- p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
p->gtime += cputime;
@@ -180,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
* Account system cpu time to a process and desired cpustat field
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
+ * @index: pointer to cpustat field that has to be updated
*/
static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, int index)
+void __account_system_time(struct task_struct *p, cputime_t cputime, int index)
{
/* Add system time to process. */
p->stime += cputime;
- p->stimescaled += cputime_scaled;
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
@@ -204,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
* @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
*/
void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
+ cputime_t cputime)
{
int index;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
- account_guest_time(p, cputime, cputime_scaled);
+ account_guest_time(p, cputime);
return;
}
@@ -223,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
index = CPUTIME_SYSTEM;
- __account_system_time(p, cputime, cputime_scaled, index);
+ __account_system_time(p, cputime, index);
}
/*
@@ -390,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
u64 cputime = (__force u64) cputime_one_jiffy * ticks;
- cputime_t scaled, other;
+ cputime_t other;
/*
* When returning from idle, many ticks can get accounted at
@@ -403,7 +393,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
if (other >= cputime)
return;
cputime -= other;
- scaled = cputime_to_scaled(cputime);
if (this_cpu_ksoftirqd() == p) {
/*
@@ -411,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
+ __account_system_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime, scaled);
+ account_user_time(p, cputime);
} else if (p == rq->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime, scaled);
+ account_guest_time(p, cputime);
} else {
- __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
+ __account_system_time(p, cputime, CPUTIME_SYSTEM);
}
}
@@ -502,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t cputime, scaled, steal;
+ cputime_t cputime, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
@@ -520,12 +509,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
cputime -= steal;
- scaled = cputime_to_scaled(cputime);
if (user_tick)
- account_user_time(p, cputime, scaled);
+ account_user_time(p, cputime);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
+ account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);
}
@@ -746,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk)
{
cputime_t delta_cpu = get_vtime_delta(tsk);
- account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+ account_system_time(tsk, irq_count(), delta_cpu);
}
void vtime_account_system(struct task_struct *tsk)
@@ -767,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk)
tsk->vtime_snap_whence = VTIME_SYS;
if (vtime_delta(tsk)) {
delta_cpu = get_vtime_delta(tsk);
- account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ account_user_time(tsk, delta_cpu);
}
write_seqcount_end(&tsk->vtime_seqcount);
}
@@ -863,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
-static void
-fetch_task_cputime(struct task_struct *t,
- cputime_t *u_dst, cputime_t *s_dst,
- cputime_t *u_src, cputime_t *s_src,
- cputime_t *udelta, cputime_t *sdelta)
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
+ cputime_t delta;
unsigned int seq;
- unsigned long long delta;
- do {
- *udelta = 0;
- *sdelta = 0;
+ if (!vtime_accounting_enabled()) {
+ *utime = t->utime;
+ *stime = t->stime;
+ return;
+ }
+ do {
seq = read_seqcount_begin(&t->vtime_seqcount);
- if (u_dst)
- *u_dst = *u_src;
- if (s_dst)
- *s_dst = *s_src;
+ *utime = t->utime;
+ *stime = t->stime;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_INACTIVE ||
- is_idle_task(t))
+ if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
continue;
delta = vtime_delta(t);
@@ -894,54 +878,10 @@ fetch_task_cputime(struct task_struct *t,
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
*/
- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
- *udelta = delta;
- } else {
- if (t->vtime_snap_whence == VTIME_SYS)
- *sdelta = delta;
- }
+ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
+ *utime += delta;
+ else if (t->vtime_snap_whence == VTIME_SYS)
+ *stime += delta;
} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
-
-
-void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
-{
- cputime_t udelta, sdelta;
-
- if (!vtime_accounting_enabled()) {
- if (utime)
- *utime = t->utime;
- if (stime)
- *stime = t->stime;
- return;
- }
-
- fetch_task_cputime(t, utime, stime, &t->utime,
- &t->stime, &udelta, &sdelta);
- if (utime)
- *utime += udelta;
- if (stime)
- *stime += sdelta;
-}
-
-void task_cputime_scaled(struct task_struct *t,
- cputime_t *utimescaled, cputime_t *stimescaled)
-{
- cputime_t udelta, sdelta;
-
- if (!vtime_accounting_enabled()) {
- if (utimescaled)
- *utimescaled = t->utimescaled;
- if (stimescaled)
- *stimescaled = t->stimescaled;
- return;
- }
-
- fetch_task_cputime(t, utimescaled, stimescaled,
- &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
- if (utimescaled)
- *utimescaled += cputime_to_scaled(udelta);
- if (stimescaled)
- *stimescaled += cputime_to_scaled(sdelta);
-}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 37e2449186c4..70ef2b1901e4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -586,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
/*
* The task might have changed its scheduling policy to something
- * different than SCHED_DEADLINE (through switched_fromd_dl()).
+ * different than SCHED_DEADLINE (through switched_from_dl()).
*/
if (!dl_task(p)) {
__dl_clear_params(p);
@@ -1137,7 +1137,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo
pull_dl_task(rq);
lockdep_repin_lock(&rq->lock, cookie);
/*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * pull_dl_task() can drop (and re-acquire) rq->lock; this
* means a stop task can slip in, in which case we need to
* re-start task selection.
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c242944f5cbd..6559d197e08a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -37,7 +37,6 @@
/*
* Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
@@ -46,31 +45,35 @@
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
+ *
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*
* Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
- = SCHED_TUNABLESCALING_LOG;
+enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
+ *
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 8;
@@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
* SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#ifdef CONFIG_SMP
/*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
+ * For asym packing, by default the lower numbered cpu has higher priority.
*/
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+int __weak arch_asym_cpu_priority(int cpu)
+{
+ return -cpu;
+}
+#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
@@ -109,16 +116,18 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
* to consumption or the quota being specified to be smaller than the slice)
* we will always only issue the remaining available time.
*
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
/*
* The margin used when comparing utilization with CPU capacity:
- * util * 1024 < capacity * margin
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
*/
-unsigned int capacity_margin = 1280; /* ~20% */
+unsigned int capacity_margin = 1280;
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -290,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beg of the tree
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the beg of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beg of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new beg
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
@@ -708,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se)
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static void attach_entity_cfs_rq(struct sched_entity *se);
/*
* With new tasks being created, their initial util_avgs are extrapolated
@@ -742,7 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
- u64 now = cfs_rq_clock_task(cfs_rq);
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -770,14 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se)
* such that the next switched_to_fair() has the
* expected state.
*/
- se->avg.last_update_time = now;
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
return;
}
}
- update_cfs_rq_load_avg(now, cfs_rq, false);
- attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */
@@ -2890,6 +2934,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return decayed;
}
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
* update_tg_load_avg - update the tg's load avg
@@ -2969,8 +3033,138 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
}
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of group cfs_rq is null, the load of the
+ * sched_entity will also be null so we can skip the formula
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and updated with current load*/
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equals to tg->shares will have
+ * a load less or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares*/
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ cfs_rq->propagate_avg = 0;
+ return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
@@ -3041,6 +3235,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed_load = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -3048,6 +3243,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3064,23 +3260,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
return decayed || removed_load;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
+ int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
}
@@ -3094,31 +3302,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- *
- * Or we're fresh through post_init_entity_util_avg().
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
@@ -3133,14 +3322,12 @@ skip_aging:
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
@@ -3150,34 +3337,20 @@ static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
@@ -3206,13 +3379,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
#endif
/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
+/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3224,9 +3409,7 @@ void remove_entity_load_avg(struct sched_entity *se)
* calls this.
*/
- last_update_time = cfs_rq_last_update_time(cfs_rq);
-
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
@@ -3251,7 +3434,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
return 0;
}
-static inline void update_load_avg(struct sched_entity *se, int not_used)
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
}
@@ -3396,6 +3582,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3470,6 +3657,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags);
@@ -3557,7 +3745,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -3675,7 +3863,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
+ update_load_avg(curr, UPDATE_TG);
update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
@@ -4572,7 +4760,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
update_cfs_shares(cfs_rq);
}
@@ -4631,7 +4819,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
update_cfs_shares(cfs_rq);
}
@@ -5199,6 +5387,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return 1;
}
+static inline int task_util(struct task_struct *p);
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -5208,15 +5404,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- unsigned long min_load = ULONG_MAX, this_load = 0;
+ struct sched_group *most_spare_sg = NULL;
+ unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
+ int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+ unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, runnable_load;
+ unsigned long spare_cap, max_spare_cap;
int local_group;
int i;
@@ -5228,8 +5430,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ runnable_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
@@ -5238,22 +5445,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
else
load = target_load(i, load_idx);
- avg_load += load;
+ runnable_load += load;
+
+ avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+ spare_cap = capacity_spare_wake(i, p);
+
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
+ runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
if (local_group) {
- this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_runnable_load = runnable_load;
+ this_avg_load = avg_load;
+ this_spare = max_spare_cap;
+ } else {
+ if (min_runnable_load > (runnable_load + imbalance)) {
+ /*
+ * The runnable load is significantly smaller
+ * so we can pick this new cpu
+ */
+ min_runnable_load = runnable_load;
+ min_avg_load = avg_load;
+ idlest = group;
+ } else if ((runnable_load < (min_runnable_load + imbalance)) &&
+ (100*min_avg_load > imbalance_scale*avg_load)) {
+ /*
+ * The runnable loads are close so take the
+ * blocked load into account through avg_load.
+ */
+ min_avg_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
- if (!idlest || 100*this_load < imbalance*min_load)
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ *
+ * Spare capacity can't be used for fork because the utilization has
+ * not been set yet, we must first select a rq to compute the initial
+ * utilization.
+ */
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
+
+ if (this_spare > task_util(p) / 2 &&
+ imbalance_scale*this_spare > 100*most_spare)
+ return NULL;
+
+ if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
+
+skip_spare:
+ if (!idlest)
+ return NULL;
+
+ if (min_runnable_load > (this_runnable_load + imbalance))
return NULL;
+
+ if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+ (100*this_avg_load < imbalance_scale*min_avg_load))
+ return NULL;
+
return idlest;
}
@@ -5590,6 +5859,24 @@ static inline int task_util(struct task_struct *p)
}
/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+ unsigned long util, capacity;
+
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
+
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+
+ return (util >= capacity) ? capacity : util;
+}
+
+/*
* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
*
@@ -5607,6 +5894,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
if (max_cap - min_cap < max_cap >> 3)
return 0;
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
return min_cap * 1024 < task_util(p) * capacity_margin;
}
@@ -6641,6 +6931,10 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent */
+ if (cfs_rq->tg->se[cpu])
+ update_load_avg(cfs_rq->tg->se[cpu], 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6845,13 +7139,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6864,6 +7159,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
@@ -6888,11 +7184,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
@@ -6902,12 +7199,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ min_capacity = min(sgc->min_capacity, min_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
@@ -6930,8 +7231,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
* Something like:
*
- * { 0 1 2 3 } { 4 5 6 7 }
- * * * * *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
@@ -7002,6 +7303,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
return false;
}
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->min_capacity * capacity_margin <
+ ref->sgc->min_capacity * 1024;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
@@ -7105,6 +7417,20 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= busiest->avg_load)
return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -7113,16 +7439,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (env->idle == CPU_NOT_IDLE)
return true;
/*
- * ASYM_PACKING needs to move all the work to the lowest
- * numbered CPUs in the group, therefore mark all groups
- * higher than ourself as busy.
+ * ASYM_PACKING needs to move all the work to the highest
+ * prority CPUs in the group, therefore mark all groups
+ * of lower priority than ourself as busy.
*/
- if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
+ if (sgs->sum_nr_running &&
+ sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
if (!sds->busiest)
return true;
- /* Prefer to move from highest possible cpu's work */
- if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
+ /* Prefer to move from lowest priority cpu's work */
+ if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
+ sg->asym_prefer_cpu))
return true;
}
@@ -7274,8 +7602,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (!sds->busiest)
return 0;
- busiest_cpu = group_first_cpu(sds->busiest);
- if (env->dst_cpu > busiest_cpu)
+ busiest_cpu = sds->busiest->asym_prefer_cpu;
+ if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
env->imbalance = DIV_ROUND_CLOSEST(
@@ -7613,10 +7941,11 @@ static int need_active_balance(struct lb_env *env)
/*
* ASYM_PACKING needs to force migrate tasks from busy but
- * higher numbered CPUs in order to pack all tasks in the
- * lowest numbered CPUs.
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
*/
- if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
+ if ((sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu))
return 1;
}
@@ -8465,7 +8794,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy, i, cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
@@ -8516,12 +8845,18 @@ static inline bool nohz_kick_needed(struct rq *rq)
}
sd = rcu_dereference(per_cpu(sd_asym, cpu));
- if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu)) {
- kick = true;
- goto unlock;
- }
+ if (sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (i == cpu ||
+ !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+ continue;
+ if (sched_asym_prefer(i, cpu)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ }
unlock:
rcu_read_unlock();
return kick;
@@ -8687,32 +9022,45 @@ static inline bool vruntime_normalized(struct task_struct *p)
return false;
}
-static void detach_task_cfs_rq(struct task_struct *p)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
+ struct cfs_rq *cfs_rq;
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
}
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* Catch up with the cfs_rq and remove our load when we leave */
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_load_avg(se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
}
-static void attach_task_cfs_rq(struct task_struct *p)
+static void attach_entity_cfs_rq(struct sched_entity *se)
{
- struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -8722,10 +9070,36 @@ static void attach_task_cfs_rq(struct task_struct *p)
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- /* Synchronize task with its cfs_rq */
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!vruntime_normalized(p)) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+
+ detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8779,6 +9153,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
@@ -8887,7 +9264,7 @@ void online_fair_sched_group(struct task_group *tg)
se = tg->se[i];
raw_spin_lock_irq(&rq->lock);
- post_init_entity_util_avg(se);
+ attach_entity_cfs_rq(se);
sync_throttle(tg, i);
raw_spin_unlock_irq(&rq->lock);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d5300d..6a4bae0a649d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -164,11 +164,14 @@ static void cpuidle_idle_call(void)
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
- if (entered_state > 0) {
- local_irq_enable();
- goto exit_idle;
+
+ if (idle_should_freeze() || dev->use_deepest_state) {
+ if (idle_should_freeze()) {
+ entered_state = cpuidle_enter_freeze(drv, dev);
+ if (entered_state > 0) {
+ local_irq_enable();
+ goto exit_idle;
+ }
}
next_state = cpuidle_find_deepest_state(drv, dev);
@@ -202,76 +205,65 @@ exit_idle:
*
* Called with polling cleared.
*/
-static void cpu_idle_loop(void)
+static void do_idle(void)
{
- int cpu = smp_processor_id();
-
- while (1) {
- /*
- * If the arch has a polling bit, we maintain an invariant:
- *
- * Our polling bit is clear if we're not scheduled (i.e. if
- * rq->curr != rq->idle). This means that, if rq->idle has
- * the polling bit set, then setting need_resched is
- * guaranteed to cause the cpu to reschedule.
- */
-
- __current_set_polling();
- quiet_vmstat();
- tick_nohz_idle_enter();
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+ * rq->idle). This means that, if rq->idle has the polling bit set,
+ * then setting need_resched is guaranteed to cause the CPU to
+ * reschedule.
+ */
- while (!need_resched()) {
- check_pgt_cache();
- rmb();
+ __current_set_polling();
+ tick_nohz_idle_enter();
- if (cpu_is_offline(cpu)) {
- cpuhp_report_idle_dead();
- arch_cpu_idle_dead();
- }
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
- local_irq_disable();
- arch_cpu_idle_enter();
-
- /*
- * In poll mode we reenable interrupts and spin.
- *
- * Also if we detected in the wakeup from idle
- * path that the tick broadcast device expired
- * for us, we don't want to go deep idle as we
- * know that the IPI is going to arrive right
- * away
- */
- if (cpu_idle_force_poll || tick_check_broadcast_expired())
- cpu_idle_poll();
- else
- cpuidle_idle_call();
-
- arch_cpu_idle_exit();
+ if (cpu_is_offline(smp_processor_id())) {
+ cpuhp_report_idle_dead();
+ arch_cpu_idle_dead();
}
- /*
- * Since we fell out of the loop above, we know
- * TIF_NEED_RESCHED must be set, propagate it into
- * PREEMPT_NEED_RESCHED.
- *
- * This is required because for polling idle loops we will
- * not have had an IPI to fold the state for us.
- */
- preempt_set_need_resched();
- tick_nohz_idle_exit();
- __current_clr_polling();
+ local_irq_disable();
+ arch_cpu_idle_enter();
/*
- * We promise to call sched_ttwu_pending and reschedule
- * if need_resched is set while polling is set. That
- * means that clearing polling needs to be visible
- * before doing these things.
+ * In poll mode we reenable interrupts and spin. Also if we
+ * detected in the wakeup from idle path that the tick
+ * broadcast device expired for us, we don't want to go deep
+ * idle as we know that the IPI is going to arrive right away.
*/
- smp_mb__after_atomic();
-
- sched_ttwu_pending();
- schedule_preempt_disabled();
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ cpu_idle_poll();
+ else
+ cpuidle_idle_call();
+ arch_cpu_idle_exit();
}
+
+ /*
+ * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+ * be set, propagate it into PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will not have had
+ * an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending() and reschedule if
+ * need_resched() is set while polling is set. That means that clearing
+ * polling needs to be visible before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
+ schedule_preempt_disabled();
}
bool cpu_in_idle(unsigned long pc)
@@ -280,6 +272,56 @@ bool cpu_in_idle(unsigned long pc)
pc < (unsigned long)__cpuidle_text_end;
}
+struct idle_timer {
+ struct hrtimer timer;
+ int done;
+};
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+ struct idle_timer *it = container_of(timer, struct idle_timer, timer);
+
+ WRITE_ONCE(it->done, 1);
+ set_tsk_need_resched(current);
+
+ return HRTIMER_NORESTART;
+}
+
+void play_idle(unsigned long duration_ms)
+{
+ struct idle_timer it;
+
+ /*
+ * Only FIFO tasks can disable the tick since they don't need the forced
+ * preemption.
+ */
+ WARN_ON_ONCE(current->policy != SCHED_FIFO);
+ WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+ WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+ WARN_ON_ONCE(!duration_ms);
+
+ rcu_sleep_check();
+ preempt_disable();
+ current->flags |= PF_IDLE;
+ cpuidle_use_deepest_state(true);
+
+ it.done = 0;
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ it.timer.function = idle_inject_timer_fn;
+ hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+
+ while (!READ_ONCE(it.done))
+ do_idle();
+
+ cpuidle_use_deepest_state(false);
+ current->flags &= ~PF_IDLE;
+
+ preempt_fold_need_resched();
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(play_idle);
+
void cpu_startup_entry(enum cpuhp_state state)
{
/*
@@ -299,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state)
#endif
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
- cpu_idle_loop();
+ while (1)
+ do_idle();
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935d4421..7b34c7826ca5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
+ unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
@@ -539,6 +540,11 @@ struct dl_rq {
#ifdef CONFIG_SMP
+static inline bool sched_asym_prefer(int a, int b)
+{
+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
+}
+
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -623,6 +629,7 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -892,7 +899,8 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity;
+ unsigned long capacity;
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
@@ -905,6 +913,7 @@ struct sched_group {
unsigned int group_weight;
struct sched_group_capacity *sgc;
+ int asym_prefer_cpu; /* cpu of highest priority in group */
/*
* The CPUs this group covers.