-rw-r--r--  include/linux/sched.h        |   4
-rw-r--r--  include/linux/stop_machine.h |   7
-rw-r--r--  include/linux/wait.h         |  30
-rw-r--r--  kernel/sched/core.c          |  17
-rw-r--r--  kernel/sched/fair.c          | 216
-rw-r--r--  kernel/sched/idle_task.c     |   1
-rw-r--r--  kernel/sched/sched.h         |   8
-rw-r--r--  kernel/stop_machine.c        |  84
-rw-r--r--  kernel/time/tick-sched.c     |   8
9 files changed, 227 insertions, 148 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index edad7a43edea..f425aac63317 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -177,9 +177,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void update_cpu_load_nohz(void); +extern void update_cpu_load_nohz(int active); #else -static inline void update_cpu_load_nohz(void) { } +static inline void update_cpu_load_nohz(int active) { } #endif extern unsigned long get_parent_ip(unsigned long addr); diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0adedca24c5b..9ef42e1f0b3a 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -29,7 +29,7 @@ struct cpu_stop_work { int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); -void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, +bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); @@ -65,7 +65,7 @@ static void stop_one_cpu_nowait_workfn(struct work_struct *work) preempt_enable(); } -static inline void stop_one_cpu_nowait(unsigned int cpu, +static inline bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { @@ -74,7 +74,10 @@ static inline void stop_one_cpu_nowait(unsigned int cpu, work_buf->fn = fn; work_buf->arg = arg; schedule_work(&work_buf->work); + return true; } + + return false; } static inline int stop_cpus(const struct cpumask *cpumask, diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..f3bac30587f7 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -102,6 +102,36 @@ init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) q->func = func; } +/** + * waitqueue_active -- locklessly test for waiters on the queue + * @q: the waitqueue to test for waiters + * + * returns true if the wait list is not empty + * + * NOTE: this function is lockless and requires care, incorrect usage _will_ + * lead to sporadic and non-obvious failure. + * + * Use either while holding wait_queue_head_t::lock or when used for wakeups + * with an extra smp_mb() like: + * + * CPU0 - waker CPU1 - waiter + * + * for (;;) { + * @cond = true; prepare_to_wait(&wq, &wait, state); + * smp_mb(); // smp_mb() from set_current_state() + * if (waitqueue_active(wq)) if (@cond) + * wake_up(wq); break; + * schedule(); + * } + * finish_wait(&wq, &wait); + * + * Because without the explicit smp_mb() it's possible for the + * waitqueue_active() load to get hoisted over the @cond store such that we'll + * observe an empty wait list while the waiter might not observe @cond. + * + * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), + * which (when the lock is uncontended) are of roughly equal cost. 
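The new waitqueue_active() comment describes a classic store-buffering pattern: each side must publish its own state (the condition, or its presence on the wait list) before checking the other side's, with a full barrier in between. Below is a minimal userspace model of that ordering using C11 atomics; cond_flag, waiters and the function names are stand-ins for illustration, not kernel APIs, and the single-threaded main() exists only so the sketch compiles and runs.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool cond_flag;    /* models @cond                         */
static atomic_int  waiters;      /* models "the wait list is not empty"  */

/* CPU0 - waker */
void waker(void)
{
        atomic_store_explicit(&cond_flag, true, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);   /* the explicit smp_mb() */
        if (atomic_load_explicit(&waiters, memory_order_relaxed))
                ;                                    /* wake_up(wq) goes here */
}

/* CPU1 - waiter, one pass of the loop from the comment */
bool waiter_once(void)
{
        atomic_fetch_add(&waiters, 1);               /* prepare_to_wait()     */
        atomic_thread_fence(memory_order_seq_cst);   /* smp_mb() from set_current_state() */
        if (atomic_load_explicit(&cond_flag, memory_order_relaxed)) {
                atomic_fetch_sub(&waiters, 1);       /* finish_wait()         */
                return true;
        }
        /* schedule() would go here, then the loop retries */
        atomic_fetch_sub(&waiters, 1);
        return false;
}

int main(void) { waker(); return !waiter_once(); }

With both fences in place at least one of the two relaxed loads is guaranteed to observe the other side's store, so a wakeup cannot be lost; drop either fence and both loads may see stale values, which is exactly the failure mode the comment warns about.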
+ */ static inline int waitqueue_active(wait_queue_head_t *q) { return !list_empty(&q->task_list); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7063c6a07440..8969a9a9dab5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -731,7 +731,7 @@ bool sched_can_stop_tick(void) if (current->policy == SCHED_RR) { struct sched_rt_entity *rt_se = ¤t->rt; - return rt_se->run_list.prev == rt_se->run_list.next; + return list_is_singular(&rt_se->run_list); } /* @@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); set_task_cpu(p, new_cpu); raw_spin_unlock(&rq->lock); @@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. + */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f04fda8f669c..ff8ec8695957 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; } +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. 
+ */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} +#endif + /* * Task is being enqueued - update stats: */ @@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2155,6 +2182,7 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2277,6 +2305,17 @@ out: else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); + + /* + * Make sure tasks use at least 32x as much time to run other code + * than they used here, to limit NUMA PTE scanning overhead to 3% max. + * Usually update_task_scan_period slows down scanning enough; on an + * overloaded system we need to limit overhead on a per task basis. + */ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } } /* @@ -2835,24 +2874,6 @@ void remove_entity_load_avg(struct sched_entity *se) atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } -/* - * Update the rq's load with the elapsed running time before entering - * idle. if the last scheduled task is not a CFS task, idle_enter will - * be the only way to update the runnable statistic. - */ -void idle_enter_fair(struct rq *this_rq) -{ -} - -/* - * Update the rq's load with the elapsed idle time before a task is - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will - * be the only way to update the runnable statistic. 
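One note on the "3% max" claim in the task_numa_work() hunk: pushing node_stamp forward by 32x the time the scan itself consumed defers the next scan until the task has run 32x as long as it scanned, so scanning can take at most 1/33 of the task's CPU time. A trivial userspace check with illustrative numbers:

#include <stdio.h>

int main(void)
{
        double scan  = 1.0;          /* time spent inside task_numa_work()  */
        double defer = 32.0 * scan;  /* node_stamp is pushed this far ahead */

        printf("worst-case scan overhead = %.2f%%\n",
               100.0 * scan / (scan + defer));       /* ~3.03% */
        return 0;
}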
- */ -void idle_exit_fair(struct rq *this_rq) -{ -} - static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) { return cfs_rq->runnable_load_avg; @@ -4240,42 +4261,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ /* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * The exact cpuload calculated at every tick would be: + * + * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * If a cpu misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when cpu may be busy, then we have: + * + * load_n = (1 - 1/2^i)^n * load_0 + * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load * * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * load' = (1 - 1/2^i)^n * load + * + * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. + * This allows us to precompute the above in said factors, thereby allowing the + * reduction of an arbitrary n in O(log_2 n) steps. (See also + * fixed_power_int()) * * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. */ #define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; + +static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 64, 32, 8, 0, 0, 0, 0, 0 }, + { 96, 72, 40, 12, 1, 0, 0, 0 }, + { 112, 98, 75, 43, 15, 1, 0, 0 }, + { 120, 112, 98, 76, 45, 16, 2, 0 } +}; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog @@ -4306,14 +4322,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) return load; } -/* +/** + * __update_cpu_load - update the rq->cpu_load[] statistics + * @this_rq: The rq to update statistics for + * @this_load: The current load + * @pending_updates: The number of missed updates + * @active: !0 for NOHZ_FULL + * * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. + * scheduler tick (TICK_NSEC). 
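The rewritten comment and the cleaned-up degrade_factor[] table can be sanity-checked outside the kernel: column j of row idx holds 128 * (1 - 1/2^idx)^(2^j) truncated to the 128-point scale, and decay_load_missed() walks the set bits of the missed-tick count, multiplying in one precomputed factor per bit. The sketch below re-implements that from the description (the kernel's own function body is not part of this hunk); the table is copied from the hunk above.

#include <stdio.h>
#include <math.h>

#define DEGRADE_SHIFT 7

static const unsigned char degrade_zero_ticks[5] = { 0, 8, 32, 64, 128 };
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
        {   0,   0,  0,  0,  0,  0, 0, 0 },
        {  64,  32,  8,  0,  0,  0, 0, 0 },
        {  96,  72, 40, 12,  1,  0, 0, 0 },
        { 112,  98, 75, 43, 15,  1, 0, 0 },
        { 120, 112, 98, 76, 45, 16, 2, 0 },
};

/* decay 'load' by (1 - 1/2^idx)^missed in O(log2(missed)) multiplies */
static unsigned long decay_load_missed(unsigned long load,
                                       unsigned long missed, int idx)
{
        int j = 0;

        if (!missed || !load)
                return load;
        if (missed >= degrade_zero_ticks[idx])
                return 0;
        if (idx == 1)
                return load >> missed;          /* (1/2)^missed exactly */

        while (missed) {
                if (missed % 2)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        unsigned long load = 1024;
        int idx = 2, missed = 11;               /* arbitrary example values */

        printf("table: %lu  exact: %.1f\n",
               decay_load_missed(load, missed, idx),
               load * pow(1.0 - 1.0 / (1 << idx), missed));
        return 0;
}

For load = 1024, idx = 2 and 11 missed ticks this prints 40 against an exact 43.2, i.e. the 128-point approximation truncates slightly low.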
+ * + * This function computes a decaying average: + * + * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load + * + * Because of NOHZ it might not get called on every tick which gives need for + * the @pending_updates argument. + * + * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 + * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load + * = A * (A * load[i]_n-2 + B) + B + * = A * (A * (A * load[i]_n-3 + B) + B) + B + * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B + * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B + * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B + * = (1 - 1/2^i)^n * (load[i]_0 - load) + load + * + * In the above we've assumed load_n := load, which is true for NOHZ_FULL as + * any change in load would have resulted in the tick being turned back on. + * + * For regular NOHZ, this reduces to: + * + * load[i]_n = (1 - 1/2^i)^n * load[i]_0 + * + * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra + * term. See the @active paramter. */ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) + unsigned long pending_updates, int active) { + unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; int i, scale; this_rq->nr_load_updates++; @@ -4325,8 +4373,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ - old_load = this_rq->cpu_load[i]; + old_load = this_rq->cpu_load[i] - tickless_load; old_load = decay_load_missed(old_load, pending_updates - 1, i); + old_load += tickless_load; new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4381,16 +4430,17 @@ static void update_idle_cpu_load(struct rq *this_rq) pending_updates = curr_jiffies - this_rq->last_load_update_tick; this_rq->last_load_update_tick = curr_jiffies; - __update_cpu_load(this_rq, load, pending_updates); + __update_cpu_load(this_rq, load, pending_updates, 0); } /* * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. */ -void update_cpu_load_nohz(void) +void update_cpu_load_nohz(int active) { struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; unsigned long pending_updates; if (curr_jiffies == this_rq->last_load_update_tick) @@ -4401,10 +4451,11 @@ void update_cpu_load_nohz(void) if (pending_updates) { this_rq->last_load_update_tick = curr_jiffies; /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. */ - __update_cpu_load(this_rq, 0, pending_updates); + __update_cpu_load(this_rq, load, pending_updates, active); } raw_spin_unlock(&this_rq->lock); } @@ -4420,7 +4471,7 @@ void update_cpu_load_active(struct rq *this_rq) * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); + __update_cpu_load(this_rq, load, 1, 1); } /* @@ -5007,8 +5058,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. 
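The derivation in the new __update_cpu_load() comment reduces to: iterating the per-tick recurrence n times with a constant instantaneous load is the same as decaying the difference (load[i] - load) over n ticks and adding load back, which is exactly what the tickless_load subtract/add pair in the code implements for NOHZ_FULL. A quick numerical check in userspace with arbitrary values:

#include <stdio.h>
#include <math.h>

int main(void)
{
        int i = 3, n = 10;                  /* load index and missed ticks */
        double cur = 100.0, load = 900.0;   /* made-up example loads       */
        double a = 1.0 - 1.0 / (1 << i);    /* the per-tick decay factor   */
        double iter = load;

        for (int k = 0; k < n; k++)         /* tick-by-tick recurrence     */
                iter = a * iter + (1.0 - a) * cur;

        printf("iterated:    %f\n", iter);
        printf("closed form: %f\n", pow(a, n) * (load - cur) + cur);
        return 0;
}

Both lines print the same value; with cur == 0 (the regular-NOHZ idle case) the closed form collapses to the pure decay handled by decay_load_missed().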
However, the caller only guarantees p->pi_lock is held; no - * other assumptions, including the state of rq->lock, should be made. + * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p) { @@ -5721,8 +5771,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); } @@ -5855,8 +5905,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -7248,8 +7298,6 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; - idle_enter_fair(this_rq); - /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -7330,10 +7378,8 @@ out: if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; - if (pulled_task) { - idle_exit_fair(this_rq); + if (pulled_task) this_rq->idle_stamp = 0; - } return pulled_task; } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fdf9b..47ce94931f1b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { - idle_exit_fair(rq); rq_last_tick_reset(rq); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775bf670..cdae23dabfdc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1252,16 +1252,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_enter_fair(struct rq *this_rq); -extern void idle_exit_fair(struct rq *this_rq); - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -#else - -static inline void idle_enter_fair(struct rq *rq) { } -static inline void idle_exit_fair(struct rq *rq) { } - #endif #ifdef CONFIG_CPU_IDLE diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20e1ef1..61101193967e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -28,7 +28,6 @@ */ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ - bool executed; /* actually executed? */ int ret; /* collected return value */ struct completion completion; /* fired if nr_todo reaches 0 */ }; @@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) } /* signal completion unless @done is NULL */ -static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +static void cpu_stop_signal_done(struct cpu_stop_done *done) { - if (done) { - if (executed) - done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); - } + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, @@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper, } /* queue @work to @stopper. 
if offline, @work is completed immediately */ -static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) +static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); unsigned long flags; + bool enabled; spin_lock_irqsave(&stopper->lock, flags); - if (stopper->enabled) + enabled = stopper->enabled; + if (enabled) __cpu_stop_queue_work(stopper, work); - else - cpu_stop_signal_done(work->done, false); + else if (work->done) + cpu_stop_signal_done(work->done); spin_unlock_irqrestore(&stopper->lock, flags); + + return enabled; } /** @@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; cpu_stop_init_done(&done, 1); - cpu_stop_queue_work(cpu, &work); + if (!cpu_stop_queue_work(cpu, &work)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + return done.ret; } /* This controls the threads on each CPU. */ @@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * struct cpu_stop_work work1, work2; struct multi_stop_data msdata; - preempt_disable(); msdata = (struct multi_stop_data){ .fn = fn, .data = arg, @@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * if (cpu1 > cpu2) swap(cpu1, cpu2); - if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { - preempt_enable(); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) return -ENOENT; - } - - preempt_enable(); wait_for_completion(&done.completion); - - return done.executed ? done.ret : -ENOENT; + return done.ret; } /** @@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * * CONTEXT: * Don't care. + * + * RETURNS: + * true if cpu_stop_work was queued successfully and @fn will be called, + * false otherwise. */ -void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, +bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; - cpu_stop_queue_work(cpu, work_buf); + return cpu_stop_queue_work(cpu, work_buf); } /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static void queue_stop_cpus_work(const struct cpumask *cpumask, +static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) { struct cpu_stop_work *work; unsigned int cpu; + bool queued = false; /* * Disable preemption while queueing to avoid getting @@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, work->fn = fn; work->arg = arg; work->done = done; - cpu_stop_queue_work(cpu, work); + if (cpu_stop_queue_work(cpu, work)) + queued = true; } lg_global_unlock(&stop_cpus_lock); + + return queued; } static int __stop_cpus(const struct cpumask *cpumask, @@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); + if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? 
done.ret : -ENOENT; + return done.ret; } /** @@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; - int ret; repeat: work = NULL; @@ -448,23 +450,19 @@ repeat: cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN] __maybe_unused; - - /* cpu stop callbacks are not allowed to sleep */ - preempt_disable(); + int ret; + /* cpu stop callbacks must not sleep, make in_atomic() == T */ + preempt_count_inc(); ret = fn(arg); - if (ret) - done->ret = ret; - - /* restore preemption and check it's still balanced */ - preempt_enable(); + if (done) { + if (ret) + done->ret = ret; + cpu_stop_signal_done(done); + } + preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %s(%p) leaked preempt count\n", - kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, - ksym_buf), arg); - - cpu_stop_signal_done(done, true); + "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7c7ec4515983..515edf3eb62b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -694,11 +694,11 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(); + update_cpu_load_nohz(active); calc_load_exit_idle(); touch_softlockup_watchdog(); @@ -725,7 +725,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (can_stop_full_tick()) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get()); + tick_nohz_restart_sched_tick(ts, ktime_get(), 1); #endif } @@ -916,7 +916,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); + tick_nohz_restart_sched_tick(ts, now, 0); tick_nohz_account_idle_ticks(ts); } |
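Finally, on the stop_machine interface change running through the last files: cpu_stop_queue_work() and stop_one_cpu_nowait() now report whether the work was actually queued, which is what lets the done->executed flag and its bookkeeping go away. A hypothetical in-kernel caller sketch; kick_cpu and example_work are illustrative names, not symbols from this patch.

#include <linux/percpu.h>
#include <linux/stop_machine.h>

/* Hypothetical per-cpu work buffer, for illustration only. */
static DEFINE_PER_CPU(struct cpu_stop_work, example_work);

static bool kick_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
        /*
         * With the bool return a caller can see that the target CPU's
         * stopper is not enabled (e.g. the CPU is going offline) and
         * fall back, instead of the work being silently discarded.
         */
        if (!stop_one_cpu_nowait(cpu, fn, arg, &per_cpu(example_work, cpu)))
                return false;   /* not queued: retry or pick another CPU */

        return true;            /* fn will run on cpu's stopper thread */
}

The synchronous paths get the same treatment: stop_one_cpu() and __stop_cpus() now return -ENOENT directly when nothing was queued, rather than inferring it afterwards from done->executed.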