diff options
-rw-r--r-- | arch/Kconfig | 3 | ||||
-rw-r--r-- | arch/ia64/kernel/time.c | 4 | ||||
-rw-r--r-- | arch/powerpc/Kconfig | 1 | ||||
-rw-r--r-- | arch/powerpc/include/asm/cputime.h | 14 | ||||
-rw-r--r-- | arch/powerpc/kernel/time.c | 8 | ||||
-rw-r--r-- | arch/s390/Kconfig | 1 | ||||
-rw-r--r-- | arch/s390/kernel/vtime.c | 9 | ||||
-rw-r--r-- | arch/x86/include/asm/preempt.h | 8 | ||||
-rw-r--r-- | arch/x86/kernel/apm_32.c | 4 | ||||
-rw-r--r-- | include/asm-generic/cputime_jiffies.h | 1 | ||||
-rw-r--r-- | include/asm-generic/cputime_nsecs.h | 1 | ||||
-rw-r--r-- | include/linux/kernel_stat.h | 4 | ||||
-rw-r--r-- | include/linux/sched.h | 87 | ||||
-rw-r--r-- | include/linux/sched/sysctl.h | 1 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 19 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 2 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 124 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 515 | ||||
-rw-r--r-- | kernel/sched/sched.h | 5 | ||||
-rw-r--r-- | kernel/sysctl.c | 7 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 4 |
23 files changed, 551 insertions, 275 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 659bdd079277..abab6590f08f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -512,6 +512,9 @@ config HAVE_CONTEXT_TRACKING config HAVE_VIRT_CPU_ACCOUNTING bool +config ARCH_HAS_SCALED_CPUTIME + bool + config HAVE_VIRT_CPU_ACCOUNTING_GEN bool default y if 64BIT diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 6f892b94e906..021f44ab4bfb 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -68,7 +68,7 @@ void vtime_account_user(struct task_struct *tsk) if (ti->ac_utime) { delta_utime = cycle_to_cputime(ti->ac_utime); - account_user_time(tsk, delta_utime, delta_utime); + account_user_time(tsk, delta_utime); ti->ac_utime = 0; } } @@ -112,7 +112,7 @@ void vtime_account_system(struct task_struct *tsk) { cputime_t delta = vtime_delta(tsk); - account_system_time(tsk, 0, delta, delta); + account_system_time(tsk, 0, delta); } EXPORT_SYMBOL_GPL(vtime_account_system); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 65fba4c34cd7..c7f120aaa98f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -160,6 +160,7 @@ config PPC select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS select GENERIC_CPU_AUTOPROBE select HAVE_VIRT_CPU_ACCOUNTING + select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select HAVE_ARCH_HARDENED_USERCOPY select HAVE_KERNEL_GZIP diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 4f60db074725..aa2e6a34b872 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -46,26 +46,12 @@ extern cputime_t cputime_one_jiffy; * Convert cputime <-> jiffies */ extern u64 __cputime_jiffies_factor; -DECLARE_PER_CPU(unsigned long, cputime_last_delta); -DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta); static inline unsigned long cputime_to_jiffies(const cputime_t ct) { return mulhdu((__force u64) ct, __cputime_jiffies_factor); } -/* Estimate the scaled cputime by scaling the real cputime based on - * the last scaled to real ratio */ -static inline cputime_t cputime_to_scaled(const cputime_t ct) -{ - if (cpu_has_feature(CPU_FTR_SPURR) && - __this_cpu_read(cputime_last_delta)) - return (__force u64) ct * - __this_cpu_read(cputime_scaled_last_delta) / - __this_cpu_read(cputime_last_delta); - return ct; -} - static inline cputime_t jiffies_to_cputime(const unsigned long jif) { u64 ct; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc3f7d0d7b79..be9751f1cb2a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -164,8 +164,6 @@ u64 __cputime_sec_factor; EXPORT_SYMBOL(__cputime_sec_factor); u64 __cputime_clockt_factor; EXPORT_SYMBOL(__cputime_clockt_factor); -DEFINE_PER_CPU(unsigned long, cputime_last_delta); -DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta); cputime_t cputime_one_jiffy; @@ -360,7 +358,8 @@ void vtime_account_system(struct task_struct *tsk) unsigned long delta, sys_scaled, stolen; delta = vtime_delta(tsk, &sys_scaled, &stolen); - account_system_time(tsk, 0, delta, sys_scaled); + account_system_time(tsk, 0, delta); + tsk->stimescaled += sys_scaled; if (stolen) account_steal_time(stolen); } @@ -393,7 +392,8 @@ void vtime_account_user(struct task_struct *tsk) acct->user_time = 0; acct->user_time_scaled = 0; acct->utime_sspurr = 0; - account_user_time(tsk, utime, utimescaled); + account_user_time(tsk, utime); + tsk->utimescaled += utimescaled; } #ifdef CONFIG_PPC32 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 426481d4cc86..028f97be5bae 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -171,6 +171,7 @@ config S390 select SYSCTL_EXCEPTION_TRACE select TTY select VIRT_CPU_ACCOUNTING + select ARCH_HAS_SCALED_CPUTIME select VIRT_TO_BUS select HAVE_NMI diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 856e30d8463f..1bd5dde2d5a9 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -137,8 +137,10 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) user_scaled = (user_scaled * mult) / div; system_scaled = (system_scaled * mult) / div; } - account_user_time(tsk, user, user_scaled); - account_system_time(tsk, hardirq_offset, system, system_scaled); + account_user_time(tsk, user); + tsk->utimescaled += user_scaled; + account_system_time(tsk, hardirq_offset, system); + tsk->stimescaled += system_scaled; steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { @@ -202,7 +204,8 @@ void vtime_account_irq_enter(struct task_struct *tsk) system_scaled = (system_scaled * mult) / div; } - account_system_time(tsk, 0, system, system_scaled); + account_system_time(tsk, 0, system); + tsk->stimescaled += system_scaled; virt_timer_forward(system); } diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 17f218645701..ec1f3c651150 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -24,7 +24,13 @@ static __always_inline int preempt_count(void) static __always_inline void preempt_count_set(int pc) { - raw_cpu_write_4(__preempt_count, pc); + int old, new; + + do { + old = raw_cpu_read_4(__preempt_count); + new = (old & PREEMPT_NEED_RESCHED) | + (pc & ~PREEMPT_NEED_RESCHED); + } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); } /* diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 51287cd90bf6..643818a7688b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev, static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ static unsigned int last_stime; /* = 0 */ - cputime_t stime; + cputime_t stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; recalc: - task_cputime(current, NULL, &stime); + task_cputime(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index fe386fc6e85e..6bb8cd45f53b 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -7,7 +7,6 @@ typedef unsigned long __nocast cputime_t; #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) -#define cputime_to_scaled(__ct) (__ct) #define jiffies_to_cputime(__hz) (__force cputime_t)(__hz) typedef u64 __nocast cputime64_t; diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index a84e28e0c634..4e3b18e559b1 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -34,7 +34,6 @@ typedef u64 __nocast cputime64_t; */ #define cputime_to_jiffies(__ct) \ cputime_div(__ct, NSEC_PER_SEC / HZ) -#define cputime_to_scaled(__ct) (__ct) #define jiffies_to_cputime(__jif) \ (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) #define cputime64_to_jiffies64(__ct) \ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44fda64ad434..00f776816aa3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -78,8 +78,8 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) return kstat_cpu(cpu).irqs_sum; } -extern void account_user_time(struct task_struct *, cputime_t, cputime_t); -extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); +extern void account_user_time(struct task_struct *, cputime_t); +extern void account_system_time(struct task_struct *, int, cputime_t); extern void account_steal_time(cputime_t); extern void account_idle_time(cputime_t); diff --git a/include/linux/sched.h b/include/linux/sched.h index e9c009dc3a4a..19abba04ceca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -262,20 +262,9 @@ extern char ___assert_task_state[1 - 2*!!( #define set_task_state(tsk, state_value) \ do { \ (tsk)->task_state_change = _THIS_IP_; \ - smp_store_mb((tsk)->state, (state_value)); \ + smp_store_mb((tsk)->state, (state_value)); \ } while (0) -/* - * set_current_state() includes a barrier so that the write of current->state - * is correctly serialised wrt the caller's subsequent test of whether to - * actually sleep: - * - * set_current_state(TASK_UNINTERRUPTIBLE); - * if (do_i_need_to_sleep()) - * schedule(); - * - * If the caller does not need such serialisation then use __set_current_state() - */ #define __set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ @@ -284,11 +273,19 @@ extern char ___assert_task_state[1 - 2*!!( #define set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ - smp_store_mb(current->state, (state_value)); \ + smp_store_mb(current->state, (state_value)); \ } while (0) #else +/* + * @tsk had better be current, or you get to keep the pieces. + * + * The only reason is that computing current can be more expensive than + * using a pointer that's already available. + * + * Therefore, see set_current_state(). + */ #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ @@ -299,11 +296,34 @@ extern char ___assert_task_state[1 - 2*!!( * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * + * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); - * if (do_i_need_to_sleep()) - * schedule(); + * if (!need_sleep) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * If the caller does not need such serialisation (because, for instance, the + * condition test and condition change and wakeup are under the same lock) then + * use __set_current_state(). + * + * The above is typically ordered against the wakeup, which does: + * + * need_sleep = false; + * wake_up_state(p, TASK_UNINTERRUPTIBLE); + * + * Where wake_up_state() (and all other wakeup primitives) imply enough + * barriers to order the store of the variable against wakeup. * - * If the caller does not need such serialisation then use __set_current_state() + * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, + * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a + * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). + * + * This is obviously fine, since they both store the exact same value. + * + * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) @@ -1627,7 +1647,10 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - cputime_t utime, stime, utimescaled, stimescaled; + cputime_t utime, stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + cputime_t utimescaled, stimescaled; +#endif cputime_t gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -2220,34 +2243,38 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime); -extern void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled); extern cputime_t task_gtime(struct task_struct *t); #else static inline void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; + *utime = t->utime; + *stime = t->stime; } +static inline cputime_t task_gtime(struct task_struct *t) +{ + return t->gtime; +} +#endif + +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME static inline void task_cputime_scaled(struct task_struct *t, cputime_t *utimescaled, cputime_t *stimescaled) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; + *utimescaled = t->utimescaled; + *stimescaled = t->stimescaled; } - -static inline cputime_t task_gtime(struct task_struct *t) +#else +static inline void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, + cputime_t *stimescaled) { - return t->gtime; + task_cputime(t, utimescaled, stimescaled); } #endif + extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 22db1e63707e..441145351301 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -36,7 +36,6 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, diff --git a/kernel/fork.c b/kernel/fork.c index 997ac1d584f7..600e93b5e539 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1551,7 +1551,9 @@ static __latent_entropy struct task_struct *copy_process( init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = p->stimescaled = 0; +#endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 154fd689fe02..dc64bd71ed2b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. + * If (@state & @p->state) @p->state = TASK_RUNNING. * - * Return: %true if @p was woken up, %false if it was already running. - * or @state didn't match @p's state. + * If the task was not queued/runnable, also place it back on a runqueue. + * + * Atomic against schedule() which would dequeue a task, also see + * set_current_state(). + * + * Return: %true if @p->state changes (an actual wakeup was done), + * %false otherwise. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -5707,7 +5708,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6184,6 +6185,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the @@ -7602,6 +7604,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get? * diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index bc0b309c3f19..9add206b5608 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[stat], - cputime64_to_clock_t(val[stat])); + (long long)cputime64_to_clock_t(val[stat])); } return 0; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5ebee3164e64..7700a9cba335 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -128,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index, * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +void account_user_time(struct task_struct *p, cputime_t cputime) { int index; /* Add user time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; @@ -153,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime, * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +static void account_guest_time(struct task_struct *p, cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); p->gtime += cputime; @@ -180,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, * Account system cpu time to a process and desired cpustat field * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated + * @index: pointer to cpustat field that has to be updated */ static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) +void __account_system_time(struct task_struct *p, cputime_t cputime, int index) { /* Add system time to process. */ p->stime += cputime; - p->stimescaled += cputime_scaled; account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -204,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) + cputime_t cputime) { int index; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); + account_guest_time(p, cputime); return; } @@ -223,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, cputime_scaled, index); + __account_system_time(p, cputime, index); } /* @@ -390,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { u64 cputime = (__force u64) cputime_one_jiffy * ticks; - cputime_t scaled, other; + cputime_t other; /* * When returning from idle, many ticks can get accounted at @@ -403,7 +393,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, if (other >= cputime) return; cputime -= other; - scaled = cputime_to_scaled(cputime); if (this_cpu_ksoftirqd() == p) { /* @@ -411,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime, scaled); + account_user_time(p, cputime); } else if (p == rq->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime, scaled); + account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); + __account_system_time(p, cputime, CPUTIME_SYSTEM); } } @@ -502,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t cputime, scaled, steal; + cputime_t cputime, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -520,12 +509,11 @@ void account_process_tick(struct task_struct *p, int user_tick) return; cputime -= steal; - scaled = cputime_to_scaled(cputime); if (user_tick) - account_user_time(p, cputime, scaled); + account_user_time(p, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); } @@ -746,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); + account_system_time(tsk, irq_count(), delta_cpu); } void vtime_account_system(struct task_struct *tsk) @@ -767,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk) tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + account_user_time(tsk, delta_cpu); } write_seqcount_end(&tsk->vtime_seqcount); } @@ -863,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -static void -fetch_task_cputime(struct task_struct *t, - cputime_t *u_dst, cputime_t *s_dst, - cputime_t *u_src, cputime_t *s_src, - cputime_t *udelta, cputime_t *sdelta) +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { + cputime_t delta; unsigned int seq; - unsigned long long delta; - do { - *udelta = 0; - *sdelta = 0; + if (!vtime_accounting_enabled()) { + *utime = t->utime; + *stime = t->stime; + return; + } + do { seq = read_seqcount_begin(&t->vtime_seqcount); - if (u_dst) - *u_dst = *u_src; - if (s_dst) - *s_dst = *s_src; + *utime = t->utime; + *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || - is_idle_task(t)) + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; delta = vtime_delta(t); @@ -894,54 +878,10 @@ fetch_task_cputime(struct task_struct *t, * Task runs either in user or kernel space, add pending nohz time to * the right place. */ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { - *udelta = delta; - } else { - if (t->vtime_snap_whence == VTIME_SYS) - *sdelta = delta; - } + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) + *utime += delta; + else if (t->vtime_snap_whence == VTIME_SYS) + *stime += delta; } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } - - -void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) -{ - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; - return; - } - - fetch_task_cputime(t, utime, stime, &t->utime, - &t->stime, &udelta, &sdelta); - if (utime) - *utime += udelta; - if (stime) - *stime += sdelta; -} - -void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled) -{ - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; - return; - } - - fetch_task_cputime(t, utimescaled, stimescaled, - &t->utimescaled, &t->stimescaled, &udelta, &sdelta); - if (utimescaled) - *utimescaled += cputime_to_scaled(udelta); - if (stimescaled) - *stimescaled += cputime_to_scaled(sdelta); -} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 37e2449186c4..c61b461248a3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -586,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) /* * The task might have changed its scheduling policy to something - * different than SCHED_DEADLINE (through switched_fromd_dl()). + * different than SCHED_DEADLINE (through switched_from_dl()). */ if (!dl_task(p)) { __dl_clear_params(p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c242944f5cbd..02605f2826a2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -93,13 +93,6 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -/* - * The exponential sliding window over which load is averaged for shares - * distribution. - * (default: 10msec) - */ -unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; - #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -116,7 +109,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; /* * The margin used when comparing utilization with CPU capacity: - * util * 1024 < capacity * margin + * util * margin < capacity * 1024 */ unsigned int capacity_margin = 1280; /* ~20% */ @@ -290,19 +283,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; @@ -708,9 +741,7 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -742,7 +773,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; - u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -770,14 +800,12 @@ void post_init_entity_util_avg(struct sched_entity *se) * such that the next switched_to_fair() has the * expected state. */ - se->avg.last_update_time = now; + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); return; } } - update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); } #else /* !CONFIG_SMP */ @@ -2890,6 +2918,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -2969,8 +3017,138 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } } + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. + */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) @@ -3041,6 +3219,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3048,6 +3227,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -3064,23 +3244,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return decayed || removed_load; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); + int decayed; /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } + + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); } @@ -3094,31 +3286,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - * - * Or we're fresh through post_init_entity_util_avg(). - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3133,14 +3306,12 @@ skip_aging: */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3150,34 +3321,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3206,13 +3363,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) #endif /* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + +/* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3224,9 +3393,7 @@ void remove_entity_load_avg(struct sched_entity *se) * calls this. */ - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -3251,7 +3418,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return 0; } -static inline void update_load_avg(struct sched_entity *se, int not_used) +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1) { cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); } @@ -3396,6 +3566,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3470,6 +3641,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -3557,7 +3729,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3675,7 +3847,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4572,7 +4744,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4631,7 +4803,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -5199,6 +5371,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return 1; } +static inline int task_util(struct task_struct *p); +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5208,7 +5388,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5216,7 +5398,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -5228,8 +5410,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5239,6 +5425,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -5246,12 +5437,33 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + */ + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -5590,6 +5802,24 @@ static inline int task_util(struct task_struct *p) } /* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + +/* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. * @@ -5607,6 +5837,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) if (max_cap - min_cap < max_cap >> 3) return 0; + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + return min_cap * 1024 < task_util(p) * capacity_margin; } @@ -6641,6 +6874,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6845,13 +7082,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6864,6 +7102,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -6888,11 +7127,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + min_capacity = min(capacity, min_capacity); } } else { /* @@ -6902,12 +7142,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = min_capacity; } /* @@ -7002,6 +7246,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->min_capacity * capacity_margin < + ref->sgc->min_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7105,6 +7360,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -8687,32 +8956,45 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } -static void detach_task_cfs_rq(struct task_struct *p) +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); + struct cfs_rq *cfs_rq; - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } -static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8722,10 +9004,36 @@ static void attach_task_cfs_rq(struct task_struct *p) se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - /* Synchronize task with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + /* Synchronize entity with its cfs_rq */ + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8779,6 +9087,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif @@ -8887,7 +9198,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); + attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 055f935d4421..d7e39317d688 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -404,6 +404,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT @@ -623,6 +624,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -892,7 +894,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..739fb17371af 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_shares_window_ns", - .data = &sysctl_sched_shares_window, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d78927a..e887ffc8eef3 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -133,9 +133,9 @@ static inline unsigned long long prof_ticks(struct task_struct *p) } static inline unsigned long long virt_ticks(struct task_struct *p) { - cputime_t utime; + cputime_t utime, stime; - task_cputime(p, &utime, NULL); + task_cputime(p, &utime, &stime); return cputime_to_expires(utime); } |