diff options
-rw-r--r-- | arch/powerpc/platforms/powernv/smp.c | 1 | ||||
-rw-r--r-- | include/linux/sched.h | 10 | ||||
-rw-r--r-- | include/linux/sched/mm.h | 2 | ||||
-rw-r--r-- | include/linux/sched/topology.h | 29 | ||||
-rw-r--r-- | include/linux/swait.h | 23 | ||||
-rw-r--r-- | kernel/cpu.c | 18 | ||||
-rw-r--r-- | kernel/exit.c | 25 | ||||
-rw-r--r-- | kernel/sched/core.c | 201 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 7 | ||||
-rw-r--r-- | kernel/sched/debug.c | 8 | ||||
-rw-r--r-- | kernel/sched/fair.c | 239 | ||||
-rw-r--r-- | kernel/sched/pelt.c | 24 | ||||
-rw-r--r-- | kernel/sched/rt.c | 12 | ||||
-rw-r--r-- | kernel/sched/sched.h | 18 | ||||
-rw-r--r-- | kernel/sched/topology.c | 31 |
15 files changed, 360 insertions, 288 deletions
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 13e251699346..b2ba3e95bda7 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void) /* Standard hot unplug procedure */ idle_task_exit(); - current->active_mm = NULL; /* for sanity */ cpu = smp_processor_id(); DBG("CPU%d offline\n", cpu); generic_set_cpu_dead(cpu); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..d4ea4407cd6d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1715,7 +1715,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); }) #ifdef CONFIG_SMP -void scheduler_ipi(void); +static __always_inline void scheduler_ipi(void) +{ + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); +} extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else static inline void scheduler_ipi(void) { } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index c49257a3b510..a132d875d351 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +void mmdrop(struct mm_struct *mm); + /* * This has to be called after a get_task_mm()/mmget_not_zero() * followed by taking the mmap_sem for writing before modifying the diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 95253ad792b0..fb11091129b3 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -11,21 +11,20 @@ */ #ifdef CONFIG_SMP -#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ -#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ -#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ -#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ -#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */ -#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */ -#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ -#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ -#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ -#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ -#define SD_NUMA 0x4000 /* cross-node balancing */ +#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */ +#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */ +#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */ +#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */ +#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */ +#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */ +#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */ +#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */ +#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */ +#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */ +#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */ +#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */ +#define SD_NUMA 0x2000 /* cross-node balancing */ #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) diff --git a/include/linux/swait.h b/include/linux/swait.h index 73e06e9986d4..6a8c22b8c2a5 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -9,23 +9,10 @@ #include <asm/current.h> /* - * BROKEN wait-queues. - * - * These "simple" wait-queues are broken garbage, and should never be - * used. The comments below claim that they are "similar" to regular - * wait-queues, but the semantics are actually completely different, and - * every single user we have ever had has been buggy (or pointless). - * - * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what - * "wake_up()" does, and has led to problems. In other cases, it has - * been fine, because there's only ever one waiter (kvm), but in that - * case gthe whole "simple" wait-queue is just pointless to begin with, - * since there is no "queue". Use "wake_up_process()" with a direct - * pointer instead. - * - * While these are very similar to regular wait queues (wait.h) the most - * important difference is that the simple waitqueue allows for deterministic - * behaviour -- IOW it has strictly bounded IRQ and lock hold times. + * Simple waitqueues are semantically very different to regular wait queues + * (wait.h). The most important difference is that the simple waitqueue allows + * for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold + * times. * * Mainly, this is accomplished by two things. Firstly not allowing swake_up_all * from IRQ disabled, and dropping the lock upon every wakeup, giving a higher @@ -39,7 +26,7 @@ * sleeper state. * * - the !exclusive mode; because that leads to O(n) wakeups, everything is - * exclusive. + * exclusive. As such swake_up_one will only ever awake _one_ waiter. * * - custom wake callback functions; because you cannot give any guarantees * about random code. This also allows swait to be used in RT, such that diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..244d30544377 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3,6 +3,7 @@ * * This code is licenced under the GPL. */ +#include <linux/sched/mm.h> #include <linux/proc_fs.h> #include <linux/smp.h> #include <linux/init.h> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu) return bringup_wait_for_ap(cpu); } +static int finish_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + struct mm_struct *mm = idle->active_mm; + + /* + * idle_task_exit() will have switched to &init_mm, now + * clean up any remaining active_mm state. + */ + if (mm != &init_mm) + idle->active_mm = &init_mm; + mmdrop(mm); + return 0; +} + /* * Hotplug state machine related functions */ @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, - .teardown.single = NULL, + .teardown.single = finish_cpu, .cant_stop = true, }, /* Final state before CPU kills itself */ diff --git a/kernel/exit.c b/kernel/exit.c index ce2a75bc0ade..d56fe51bdf07 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -708,8 +708,12 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - profile_task_exit(tsk); - kcov_task_exit(tsk); + /* + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ WARN_ON(blk_needs_flush_plug(tsk)); @@ -727,6 +731,16 @@ void __noreturn do_exit(long code) */ set_fs(USER_DS); + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + profile_task_exit(tsk); + kcov_task_exit(tsk); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -744,13 +758,6 @@ void __noreturn do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..6130ab170e93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -219,6 +219,13 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } +static inline void +rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) +{ + csd->flags = 0; + csd->func = func; + csd->info = rq; +} #ifdef CONFIG_SCHED_HRTICK /* @@ -314,16 +321,14 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED_HARD); } + #endif /* CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; + rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } @@ -650,6 +655,16 @@ static inline bool got_nohz_idle_kick(void) return false; } +static void nohz_csd_func(void *info) +{ + struct rq *rq = info; + + if (got_nohz_idle_kick()) { + rq->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } +} + #else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) @@ -2292,53 +2307,26 @@ void sched_ttwu_pending(void) rq_unlock_irqrestore(rq, &rf); } -void scheduler_ipi(void) +static void wake_csd_func(void *info) { - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); - - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); sched_ttwu_pending(); - - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +/* + * Queue a task on the target CPUs wake_list and wake the CPU via IPI if + * necessary. The wakee CPU on receipt of the IPI will queue the task + * via sched_ttwu_wakeup() for activation so the wakee incurs the cost + * of the wakeup instead of the waker. + */ +static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (llist_add(&p->wake_entry, &rq->wake_list)) { if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); + smp_call_function_single_async(cpu, &rq->wake_csd); else trace_sched_wake_idle_without_ipi(cpu); } @@ -2372,6 +2360,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } + +static inline bool ttwu_queue_cond(int cpu, int wake_flags) +{ + /* + * If the CPU does not share cache, then queue the task on the + * remote rqs wakelist to avoid accessing remote data. + */ + if (!cpus_share_cache(smp_processor_id(), cpu)) + return true; + + /* + * If the task is descheduling and the only running task on the + * CPU then use the wakelist to offload the task activation to + * the soon-to-be-idle CPU as the current CPU is likely busy. + * nr_running is checked to avoid unnecessary task stacking. + */ + if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + return true; + + return false; +} + +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ + __ttwu_queue_wakelist(p, cpu, wake_flags); + return true; + } + + return false; +} #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2380,11 +2400,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq_flags rf; #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - ttwu_queue_remote(p, cpu, wake_flags); + if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; - } #endif rq_lock(rq, &rf); @@ -2566,7 +2583,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->on_rq && ttwu_remote(p, wake_flags)) goto unlock; + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + #ifdef CONFIG_SMP + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2590,6 +2615,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + */ + if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) + goto unlock; + + /* + * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * * Pairs with the smp_store_release() in finish_task(). @@ -2599,28 +2634,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } - -#else /* CONFIG_SMP */ - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -3899,6 +3918,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) schedstat_inc(this_rq()->sched_count); } +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + const struct sched_class *class; + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ @@ -3932,22 +3973,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: -#ifdef CONFIG_SMP - /* - * We must do the balancing pass before put_next_task(), such - * that when we release the rq->lock the task is in the same - * state as before we took rq->lock. - * - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -#endif - - put_prev_task(rq, prev); + put_prev_task_balance(rq, prev, rf); for_each_class(class) { p = class->pick_next_task(rq); @@ -6190,13 +6216,14 @@ void idle_task_exit(void) struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); + BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; finish_arch_post_lock_switch(); } - mmdrop(mm); + + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } /* @@ -6589,6 +6616,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -6641,7 +6670,6 @@ void __init sched_init(void) init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* @@ -6663,7 +6691,6 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -6685,12 +6712,16 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + rq_csd_init(rq, &rq->wake_csd, wake_csd_func); + INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); + + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); @@ -7385,6 +7416,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -7413,6 +7446,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) return -EINVAL; /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota != RUNTIME_INF && quota > max_cfs_runtime) + return -EINVAL; + + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9fbb10383434..941c28cf9738 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -5,6 +5,7 @@ * Based on the work by Paul Menage ([email protected]) and Balbir Singh * ([email protected]). */ +#include <asm/irq_regs.h> #include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ @@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = task_pt_regs(tsk); + struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); if (regs && user_mode(regs)) index = CPUACCT_STAT_USER; @@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + __this_cpu_add(ca->cpuusage->usages[index], cputime); rcu_read_unlock(); } @@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpustat)->cpustat[index] += val; + __this_cpu_add(ca->cpustat->cpustat[index], val); rcu_read_unlock(); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 239970b991c0..1c24a6bbdae2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ @@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), @@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n"); SEQ_printf(m, "-------------------------------------------------------" - "----------------------------------------------------\n"); + "------------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da3e5b54715b..dda9b194d225 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -191,7 +191,7 @@ static void update_sysctl(void) #undef SET_SYSCTL } -void sched_init_granularity(void) +void __init sched_init_granularity(void) { update_sysctl(); } @@ -1094,7 +1094,7 @@ struct numa_group { * more by CPU use than by memory faults. */ unsigned long *faults_cpu; - unsigned long faults[0]; + unsigned long faults[]; }; /* @@ -3441,52 +3441,46 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* Nothing to update */ if (!delta) return; - /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. - */ - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + se->avg.util_sum = se->avg.util_avg * divider; /* Update parent cfs_rq utilization */ add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* Nothing to update */ if (!delta) return; - /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. - */ - /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + se->avg.runnable_sum = se->avg.runnable_avg * divider; /* Update parent cfs_rq runnable */ add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; } static inline void @@ -3496,19 +3490,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq unsigned long load_avg; u64 load_sum = 0; s64 delta_sum; + u32 divider; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + if (runnable_sum >= 0) { /* * Add runnable; clip at LOAD_AVG_MAX. Reflects that until * the CPU is saturated running == runnable. */ runnable_sum += se->avg.load_sum; - runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + runnable_sum = min_t(long, runnable_sum, divider); } else { /* * Estimate the new unweighted runnable_sum of the gcfs_rq by @@ -3533,7 +3534,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, LOAD_AVG_MAX); + load_avg = div_s64(load_sum, divider); delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; @@ -3697,6 +3698,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* @@ -3873,6 +3878,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -4054,7 +4061,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -4588,16 +4595,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -4609,13 +4616,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; } +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ @@ -4704,13 +4723,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; - bool empty; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4744,29 +4783,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta); - cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_b->throttled_cfs_rq); - - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. - */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4933,14 +4956,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* * This check is repeated as we release cfs_b->lock while we unthrottle. */ - while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - cfs_b->distribute_running = 1; + while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); } @@ -5054,10 +5075,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->slack_started = false; - if (cfs_b->distribute_running) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); @@ -5067,9 +5084,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - if (runtime) - cfs_b->distribute_running = 1; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) @@ -5078,7 +5092,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -5139,8 +5152,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -5170,6 +5182,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -5199,8 +5213,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; @@ -5221,7 +5233,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; - cfs_b->distribute_running = 0; cfs_b->slack_started = false; } @@ -5506,28 +5517,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) list_add_leaf_cfs_rq(cfs_rq); } -enqueue_throttle: - if (!se) { - add_nr_running(rq, 1); - /* - * Since new tasks are assigned an initial util_avg equal to - * half of the spare capacity of their CPU, tiny tasks have the - * ability to cross the overutilized threshold, which will - * result in the load balancer ruining all the task placement - * done by EAS. As a way to mitigate that effect, do not account - * for the first enqueue operation of new tasks during the - * overutilized flag detection. - * - * A better way of solving this problem would be to wait for - * the PELT signals of tasks to converge before taking them - * into account, but that is not straightforward to implement, - * and the following generally works well enough in practice. - */ - if (flags & ENQUEUE_WAKEUP) - update_overutilized_status(rq); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); - } + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (flags & ENQUEUE_WAKEUP) + update_overutilized_status(rq); +enqueue_throttle: if (cfs_bandwidth_used()) { /* * When bandwidth control is enabled; the cfs_rq_throttled() @@ -5737,7 +5747,7 @@ static int wake_wide(struct task_struct *p) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int factor = __this_cpu_read(sd_llc_size); if (master < slave) swap(master, slave); @@ -5846,8 +5856,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag); +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5930,7 +5939,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -6671,9 +6680,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_lock(); for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - break; - /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -8702,8 +8708,7 @@ static bool update_pick_idlest(struct sched_group *idlest, * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -9434,7 +9439,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; + int cpu; /* * Ensure the balancing environment is consistent; can happen @@ -9455,18 +9460,12 @@ static int should_we_balance(struct lb_env *env) if (!idle_cpu(cpu)) continue; - balance_cpu = cpu; - break; + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; } - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. - */ - return balance_cpu == env->dst_cpu; + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; } /* @@ -9819,9 +9818,8 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; } if (likely(sd)) { @@ -9910,9 +9908,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -10034,12 +10029,11 @@ static void kick_ilb(unsigned int flags) return; /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which + * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ - smp_send_reschedule(ilb_cpu); + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); } /* @@ -10450,7 +10444,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -10501,9 +10495,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int continue_balancing = 1; u64 t0, domain_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); break; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b647d04d9c8b..b4b1ff96642f 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa, return 1; } +/* + * When syncing *_avg with *_sum, we must take into account the current + * position in the PELT segment otherwise the remaining part of the segment + * will be considered as idle time whereas it's not yet elapsed and this will + * generate unwanted oscillation in the range [1002..1024[. + * + * The max value of *_sum varies with the position in the time segment and is + * equals to : + * + * LOAD_AVG_MAX*y + sa->period_contrib + * + * which can be simplified into: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024 + * + * The same care must be taken when a sched entity is added, updated or + * removed from a cfs_rq and we need to update sched_avg. Scheduler entities + * and the cfs rq, to which they are attached, have the same position in the + * time segment because they use the same clock. This means that we can use + * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity + * if it's more convenient. + */ static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..6d60ba21ed29 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -9,6 +9,8 @@ int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +/* More than 4 hours if BW_SHIFT equals 20. */ +static const u64 max_rt_runtime = MAX_BW; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg, if (rt_period == 0) return -EINVAL; + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); err = __rt_schedulable(tg, rt_period, rt_runtime); if (err) @@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void) return -EINVAL; if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || + ((u64)sysctl_sched_rt_runtime * + NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; return 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3a57675ccf..4b32cff0dcbe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -349,7 +349,6 @@ struct cfs_bandwidth { u8 idle; u8 period_active; - u8 distribute_running; u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; @@ -890,9 +889,10 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; + call_single_data_t nohz_csd; #endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; - atomic_t nohz_flags; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ unsigned long nr_load_updates; @@ -979,7 +979,7 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1021,6 +1021,7 @@ struct rq { #endif #ifdef CONFIG_SMP + call_single_data_t wake_csd; struct llist_head wake_list; #endif @@ -1461,7 +1462,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) @@ -1504,14 +1505,10 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); - #else static inline void sched_ttwu_pending(void) { } -static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } - #endif /* CONFIG_SMP */ #include "stats.h" @@ -1688,7 +1685,8 @@ static inline int task_on_rq_migrating(struct task_struct *p) */ #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ +#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ +#define WF_ON_RQ 0x08 /* Wakee is on_rq */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1918,6 +1916,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..1d7b446fac7d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); - return -1; - } - printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); @@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | + if (sd->flags & (SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | @@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + pflags &= ~(SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl, .cache_nice_tries = 0, - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE + .flags = 1*SD_BALANCE_NEWIDLE | 1*SD_BALANCE_EXEC | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE |