diff options
-rw-r--r-- | arch/powerpc/platforms/powernv/smp.c | 1 | ||||
-rw-r--r-- | include/linux/sched.h | 10 | ||||
-rw-r--r-- | include/linux/sched/mm.h | 2 | ||||
-rw-r--r-- | include/linux/sched/topology.h | 29 | ||||
-rw-r--r-- | include/linux/swait.h | 23 | ||||
-rw-r--r-- | kernel/cpu.c | 18 | ||||
-rw-r--r-- | kernel/exit.c | 25 | ||||
-rw-r--r-- | kernel/sched/core.c | 114 | ||||
-rw-r--r-- | kernel/sched/debug.c | 8 | ||||
-rw-r--r-- | kernel/sched/fair.c | 149 | ||||
-rw-r--r-- | kernel/sched/sched.h | 11 | ||||
-rw-r--r-- | kernel/sched/topology.c | 31 |
12 files changed, 203 insertions, 218 deletions
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 13e251699346..b2ba3e95bda7 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void) /* Standard hot unplug procedure */ idle_task_exit(); - current->active_mm = NULL; /* for sanity */ cpu = smp_processor_id(); DBG("CPU%d offline\n", cpu); generic_set_cpu_dead(cpu); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..d4ea4407cd6d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1715,7 +1715,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); }) #ifdef CONFIG_SMP -void scheduler_ipi(void); +static __always_inline void scheduler_ipi(void) +{ + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); +} extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else static inline void scheduler_ipi(void) { } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index c49257a3b510..a132d875d351 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +void mmdrop(struct mm_struct *mm); + /* * This has to be called after a get_task_mm()/mmget_not_zero() * followed by taking the mmap_sem for writing before modifying the diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 95253ad792b0..fb11091129b3 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -11,21 +11,20 @@ */ #ifdef CONFIG_SMP -#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ -#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ -#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ -#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ -#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */ -#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */ -#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ -#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ -#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ -#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ -#define SD_NUMA 0x4000 /* cross-node balancing */ +#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */ +#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */ +#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */ +#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */ +#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */ +#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */ +#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */ +#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */ +#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */ +#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */ +#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */ +#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */ +#define SD_NUMA 0x2000 /* cross-node balancing */ #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) diff --git a/include/linux/swait.h b/include/linux/swait.h index 73e06e9986d4..6a8c22b8c2a5 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -9,23 +9,10 @@ #include <asm/current.h> /* - * BROKEN wait-queues. - * - * These "simple" wait-queues are broken garbage, and should never be - * used. The comments below claim that they are "similar" to regular - * wait-queues, but the semantics are actually completely different, and - * every single user we have ever had has been buggy (or pointless). - * - * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what - * "wake_up()" does, and has led to problems. In other cases, it has - * been fine, because there's only ever one waiter (kvm), but in that - * case gthe whole "simple" wait-queue is just pointless to begin with, - * since there is no "queue". Use "wake_up_process()" with a direct - * pointer instead. - * - * While these are very similar to regular wait queues (wait.h) the most - * important difference is that the simple waitqueue allows for deterministic - * behaviour -- IOW it has strictly bounded IRQ and lock hold times. + * Simple waitqueues are semantically very different to regular wait queues + * (wait.h). The most important difference is that the simple waitqueue allows + * for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold + * times. * * Mainly, this is accomplished by two things. Firstly not allowing swake_up_all * from IRQ disabled, and dropping the lock upon every wakeup, giving a higher @@ -39,7 +26,7 @@ * sleeper state. * * - the !exclusive mode; because that leads to O(n) wakeups, everything is - * exclusive. + * exclusive. As such swake_up_one will only ever awake _one_ waiter. * * - custom wake callback functions; because you cannot give any guarantees * about random code. This also allows swait to be used in RT, such that diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..244d30544377 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3,6 +3,7 @@ * * This code is licenced under the GPL. */ +#include <linux/sched/mm.h> #include <linux/proc_fs.h> #include <linux/smp.h> #include <linux/init.h> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu) return bringup_wait_for_ap(cpu); } +static int finish_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + struct mm_struct *mm = idle->active_mm; + + /* + * idle_task_exit() will have switched to &init_mm, now + * clean up any remaining active_mm state. + */ + if (mm != &init_mm) + idle->active_mm = &init_mm; + mmdrop(mm); + return 0; +} + /* * Hotplug state machine related functions */ @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, - .teardown.single = NULL, + .teardown.single = finish_cpu, .cant_stop = true, }, /* Final state before CPU kills itself */ diff --git a/kernel/exit.c b/kernel/exit.c index ce2a75bc0ade..d56fe51bdf07 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -708,8 +708,12 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - profile_task_exit(tsk); - kcov_task_exit(tsk); + /* + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ WARN_ON(blk_needs_flush_plug(tsk)); @@ -727,6 +731,16 @@ void __noreturn do_exit(long code) */ set_fs(USER_DS); + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + profile_task_exit(tsk); + kcov_task_exit(tsk); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -744,13 +758,6 @@ void __noreturn do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..74fb89b5ce3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -219,6 +219,13 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } +static inline void +rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) +{ + csd->flags = 0; + csd->func = func; + csd->info = rq; +} #ifdef CONFIG_SCHED_HRTICK /* @@ -314,16 +321,14 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED_HARD); } + #endif /* CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; + rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } @@ -650,6 +655,16 @@ static inline bool got_nohz_idle_kick(void) return false; } +static void nohz_csd_func(void *info) +{ + struct rq *rq = info; + + if (got_nohz_idle_kick()) { + rq->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } +} + #else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) @@ -2292,42 +2307,9 @@ void sched_ttwu_pending(void) rq_unlock_irqrestore(rq, &rf); } -void scheduler_ipi(void) +static void wake_csd_func(void *info) { - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); - - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); sched_ttwu_pending(); - - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) @@ -2336,9 +2318,9 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (llist_add(&p->wake_entry, &rq->wake_list)) { if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); + smp_call_function_single_async(cpu, &rq->wake_csd); else trace_sched_wake_idle_without_ipi(cpu); } @@ -3899,6 +3881,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) schedstat_inc(this_rq()->sched_count); } +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + const struct sched_class *class; + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ @@ -3932,22 +3936,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: -#ifdef CONFIG_SMP - /* - * We must do the balancing pass before put_next_task(), such - * that when we release the rq->lock the task is in the same - * state as before we took rq->lock. - * - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -#endif - - put_prev_task(rq, prev); + put_prev_task_balance(rq, prev, rf); for_each_class(class) { p = class->pick_next_task(rq); @@ -6190,13 +6179,14 @@ void idle_task_exit(void) struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); + BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; finish_arch_post_lock_switch(); } - mmdrop(mm); + + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } /* @@ -6589,6 +6579,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -6641,7 +6633,6 @@ void __init sched_init(void) init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* @@ -6663,7 +6654,6 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -6685,12 +6675,16 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + rq_csd_init(rq, &rq->wake_csd, wake_csd_func); + INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); + + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 239970b991c0..1c24a6bbdae2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ @@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), @@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n"); SEQ_printf(m, "-------------------------------------------------------" - "----------------------------------------------------\n"); + "------------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 538ba5d94e99..9a58874ef104 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -191,7 +191,7 @@ static void update_sysctl(void) #undef SET_SYSCTL } -void sched_init_granularity(void) +void __init sched_init_granularity(void) { update_sysctl(); } @@ -3873,6 +3873,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -4054,7 +4056,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -4588,16 +4590,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -4609,13 +4611,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; } +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ @@ -4704,13 +4718,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; - bool empty; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4744,29 +4778,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta); - cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_b->throttled_cfs_rq); - - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. - */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4933,14 +4951,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* * This check is repeated as we release cfs_b->lock while we unthrottle. */ - while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - cfs_b->distribute_running = 1; + while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); } @@ -5054,10 +5070,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->slack_started = false; - if (cfs_b->distribute_running) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); @@ -5067,9 +5079,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - if (runtime) - cfs_b->distribute_running = 1; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) @@ -5078,7 +5087,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -5139,8 +5147,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -5170,6 +5177,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -5199,8 +5208,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; @@ -5221,7 +5228,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; - cfs_b->distribute_running = 0; cfs_b->slack_started = false; } @@ -5737,7 +5743,7 @@ static int wake_wide(struct task_struct *p) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int factor = __this_cpu_read(sd_llc_size); if (master < slave) swap(master, slave); @@ -5846,8 +5852,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag); +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5930,7 +5935,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -6671,9 +6676,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_lock(); for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - break; - /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -8702,8 +8704,7 @@ static bool update_pick_idlest(struct sched_group *idlest, * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -9434,7 +9435,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; + int cpu; /* * Ensure the balancing environment is consistent; can happen @@ -9455,18 +9456,12 @@ static int should_we_balance(struct lb_env *env) if (!idle_cpu(cpu)) continue; - balance_cpu = cpu; - break; + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; } - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. - */ - return balance_cpu == env->dst_cpu; + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; } /* @@ -9819,9 +9814,8 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; } if (likely(sd)) { @@ -9910,9 +9904,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -10034,12 +10025,11 @@ static void kick_ilb(unsigned int flags) return; /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which + * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ - smp_send_reschedule(ilb_cpu); + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); } /* @@ -10450,7 +10440,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -10501,9 +10491,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int continue_balancing = 1; u64 t0, domain_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); break; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3a57675ccf..21416b30c520 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -349,7 +349,6 @@ struct cfs_bandwidth { u8 idle; u8 period_active; - u8 distribute_running; u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; @@ -890,9 +889,10 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; + call_single_data_t nohz_csd; #endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; - atomic_t nohz_flags; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ unsigned long nr_load_updates; @@ -979,7 +979,7 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1021,6 +1021,7 @@ struct rq { #endif #ifdef CONFIG_SMP + call_single_data_t wake_csd; struct llist_head wake_list; #endif @@ -1504,14 +1505,10 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); - #else static inline void sched_ttwu_pending(void) { } -static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } - #endif /* CONFIG_SMP */ #include "stats.h" diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..1d7b446fac7d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); - return -1; - } - printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); @@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | + if (sd->flags & (SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | @@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + pflags &= ~(SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl, .cache_nice_tries = 0, - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE + .flags = 1*SD_BALANCE_NEWIDLE | 1*SD_BALANCE_EXEC | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE |