aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/stop_machine.h7
-rw-r--r--include/linux/wait.h30
-rw-r--r--kernel/sched/core.c17
-rw-r--r--kernel/sched/fair.c216
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/sched.h8
-rw-r--r--kernel/stop_machine.c84
-rw-r--r--kernel/time/tick-sched.c8
9 files changed, 227 insertions, 148 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edad7a43edea..f425aac63317 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -177,9 +177,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
extern void calc_global_load(unsigned long ticks);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(void);
+extern void update_cpu_load_nohz(int active);
#else
-static inline void update_cpu_load_nohz(void) { }
+static inline void update_cpu_load_nohz(int active) { }
#endif
extern unsigned long get_parent_ip(unsigned long addr);
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 0adedca24c5b..9ef42e1f0b3a 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -29,7 +29,7 @@ struct cpu_stop_work {
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
-void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf);
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
@@ -65,7 +65,7 @@ static void stop_one_cpu_nowait_workfn(struct work_struct *work)
preempt_enable();
}
-static inline void stop_one_cpu_nowait(unsigned int cpu,
+static inline bool stop_one_cpu_nowait(unsigned int cpu,
cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
@@ -74,7 +74,10 @@ static inline void stop_one_cpu_nowait(unsigned int cpu,
work_buf->fn = fn;
work_buf->arg = arg;
schedule_work(&work_buf->work);
+ return true;
}
+
+ return false;
}
static inline int stop_cpus(const struct cpumask *cpumask,
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..f3bac30587f7 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -102,6 +102,36 @@ init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
q->func = func;
}
+/**
+ * waitqueue_active -- locklessly test for waiters on the queue
+ * @q: the waitqueue to test for waiters
+ *
+ * returns true if the wait list is not empty
+ *
+ * NOTE: this function is lockless and requires care, incorrect usage _will_
+ * lead to sporadic and non-obvious failure.
+ *
+ * Use either while holding wait_queue_head_t::lock or when used for wakeups
+ * with an extra smp_mb() like:
+ *
+ * CPU0 - waker CPU1 - waiter
+ *
+ * for (;;) {
+ * @cond = true; prepare_to_wait(&wq, &wait, state);
+ * smp_mb(); // smp_mb() from set_current_state()
+ * if (waitqueue_active(wq)) if (@cond)
+ * wake_up(wq); break;
+ * schedule();
+ * }
+ * finish_wait(&wq, &wait);
+ *
+ * Because without the explicit smp_mb() it's possible for the
+ * waitqueue_active() load to get hoisted over the @cond store such that we'll
+ * observe an empty wait list while the waiter might not observe @cond.
+ *
+ * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
+ * which (when the lock is uncontended) are of roughly equal cost.
+ */
static inline int waitqueue_active(wait_queue_head_t *q)
{
return !list_empty(&q->task_list);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7063c6a07440..8969a9a9dab5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -731,7 +731,7 @@ bool sched_can_stop_tick(void)
if (current->policy == SCHED_RR) {
struct sched_rt_entity *rt_se = &current->rt;
- return rt_se->run_list.prev == rt_se->run_list.next;
+ return list_is_singular(&rt_se->run_list);
}
/*
@@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ dequeue_task(rq, p, 0);
set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock);
@@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
return rq;
@@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
+ /*
+ * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ * because schedstat_wait_{start,end} rebase migrating task's wait_start
+ * time relying on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f04fda8f669c..ff8ec8695957 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
}
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
/*
* Task is being enqueued - update stats:
*/
@@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -2155,6 +2182,7 @@ void task_numa_work(struct callback_head *work)
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2277,6 +2305,17 @@ out:
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
+
+ /*
+ * Make sure tasks use at least 32x as much time to run other code
+ * than they used here, to limit NUMA PTE scanning overhead to 3% max.
+ * Usually update_task_scan_period slows down scanning enough; on an
+ * overloaded system we need to limit overhead on a per task basis.
+ */
+ if (unlikely(p->se.sum_exec_runtime != runtime)) {
+ u64 diff = p->se.sum_exec_runtime - runtime;
+ p->node_stamp += 32 * diff;
+ }
}
/*
@@ -2835,24 +2874,6 @@ void remove_entity_load_avg(struct sched_entity *se)
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
-/*
- * Update the rq's load with the elapsed running time before entering
- * idle. if the last scheduled task is not a CFS task, idle_enter will
- * be the only way to update the runnable statistic.
- */
-void idle_enter_fair(struct rq *this_rq)
-{
-}
-
-/*
- * Update the rq's load with the elapsed idle time before a task is
- * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
- * be the only way to update the runnable statistic.
- */
-void idle_exit_fair(struct rq *this_rq)
-{
-}
-
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->runnable_load_avg;
@@ -4240,42 +4261,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ * The exact cpuload calculated at every tick would be:
+ *
+ * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
*
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ * If a cpu misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when cpu may be busy, then we have:
+ *
+ * load_n = (1 - 1/2^i)^n * load_0
+ * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
*
* decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * load' = (1 - 1/2^i)^n * load
+ *
+ * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
+ * This allows us to precompute the above in said factors, thereby allowing the
+ * reduction of an arbitrary n in O(log_2 n) steps. (See also
+ * fixed_power_int())
*
* The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
*/
#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
+
+static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 64, 32, 8, 0, 0, 0, 0, 0 },
+ { 96, 72, 40, 12, 1, 0, 0, 0 },
+ { 112, 98, 75, 43, 15, 1, 0, 0 },
+ { 120, 112, 98, 76, 45, 16, 2, 0 }
+};
/*
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
@@ -4306,14 +4322,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
return load;
}
-/*
+/**
+ * __update_cpu_load - update the rq->cpu_load[] statistics
+ * @this_rq: The rq to update statistics for
+ * @this_load: The current load
+ * @pending_updates: The number of missed updates
+ * @active: !0 for NOHZ_FULL
+ *
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
+ * scheduler tick (TICK_NSEC).
+ *
+ * This function computes a decaying average:
+ *
+ * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
+ *
+ * Because of NOHZ it might not get called on every tick which gives need for
+ * the @pending_updates argument.
+ *
+ * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
+ * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
+ * = A * (A * load[i]_n-2 + B) + B
+ * = A * (A * (A * load[i]_n-3 + B) + B) + B
+ * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
+ * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
+ * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
+ * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
+ *
+ * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
+ * any change in load would have resulted in the tick being turned back on.
+ *
+ * For regular NOHZ, this reduces to:
+ *
+ * load[i]_n = (1 - 1/2^i)^n * load[i]_0
+ *
+ * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
+ * term. See the @active paramter.
*/
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
+ unsigned long pending_updates, int active)
{
+ unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
int i, scale;
this_rq->nr_load_updates++;
@@ -4325,8 +4373,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
- old_load = this_rq->cpu_load[i];
+ old_load = this_rq->cpu_load[i] - tickless_load;
old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ old_load += tickless_load;
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -4381,16 +4430,17 @@ static void update_idle_cpu_load(struct rq *this_rq)
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
this_rq->last_load_update_tick = curr_jiffies;
- __update_cpu_load(this_rq, load, pending_updates);
+ __update_cpu_load(this_rq, load, pending_updates, 0);
}
/*
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
*/
-void update_cpu_load_nohz(void)
+void update_cpu_load_nohz(int active)
{
struct rq *this_rq = this_rq();
unsigned long curr_jiffies = READ_ONCE(jiffies);
+ unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
unsigned long pending_updates;
if (curr_jiffies == this_rq->last_load_update_tick)
@@ -4401,10 +4451,11 @@ void update_cpu_load_nohz(void)
if (pending_updates) {
this_rq->last_load_update_tick = curr_jiffies;
/*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
+ * In the regular NOHZ case, we were idle, this means load 0.
+ * In the NOHZ_FULL case, we were non-idle, we should consider
+ * its weighted load.
*/
- __update_cpu_load(this_rq, 0, pending_updates);
+ __update_cpu_load(this_rq, load, pending_updates, active);
}
raw_spin_unlock(&this_rq->lock);
}
@@ -4420,7 +4471,7 @@ void update_cpu_load_active(struct rq *this_rq)
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
+ __update_cpu_load(this_rq, load, 1, 1);
}
/*
@@ -5007,8 +5058,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. However, the caller only guarantees p->pi_lock is held; no
- * other assumptions, including the state of rq->lock, should be made.
+ * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p)
{
@@ -5721,8 +5771,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
set_task_cpu(p, env->dst_cpu);
}
@@ -5855,8 +5905,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -7248,8 +7298,6 @@ static int idle_balance(struct rq *this_rq)
int pulled_task = 0;
u64 curr_cost = 0;
- idle_enter_fair(this_rq);
-
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -7330,10 +7378,8 @@ out:
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
- if (pulled_task) {
- idle_exit_fair(this_rq);
+ if (pulled_task)
this_rq->idle_stamp = 0;
- }
return pulled_task;
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c4ae0f1fdf9b..47ce94931f1b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
- idle_exit_fair(rq);
rq_last_tick_reset(rq);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b242775bf670..cdae23dabfdc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1252,16 +1252,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void idle_enter_fair(struct rq *this_rq);
-extern void idle_exit_fair(struct rq *this_rq);
-
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
-#else
-
-static inline void idle_enter_fair(struct rq *rq) { }
-static inline void idle_exit_fair(struct rq *rq) { }
-
#endif
#ifdef CONFIG_CPU_IDLE
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 867bc20e1ef1..61101193967e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -28,7 +28,6 @@
*/
struct cpu_stop_done {
atomic_t nr_todo; /* nr left to execute */
- bool executed; /* actually executed? */
int ret; /* collected return value */
struct completion completion; /* fired if nr_todo reaches 0 */
};
@@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
}
/* signal completion unless @done is NULL */
-static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+static void cpu_stop_signal_done(struct cpu_stop_done *done)
{
- if (done) {
- if (executed)
- done->executed = true;
- if (atomic_dec_and_test(&done->nr_todo))
- complete(&done->completion);
- }
+ if (atomic_dec_and_test(&done->nr_todo))
+ complete(&done->completion);
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
@@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
}
/* queue @work to @stopper. if offline, @work is completed immediately */
-static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
unsigned long flags;
+ bool enabled;
spin_lock_irqsave(&stopper->lock, flags);
- if (stopper->enabled)
+ enabled = stopper->enabled;
+ if (enabled)
__cpu_stop_queue_work(stopper, work);
- else
- cpu_stop_signal_done(work->done, false);
+ else if (work->done)
+ cpu_stop_signal_done(work->done);
spin_unlock_irqrestore(&stopper->lock, flags);
+
+ return enabled;
}
/**
@@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
cpu_stop_init_done(&done, 1);
- cpu_stop_queue_work(cpu, &work);
+ if (!cpu_stop_queue_work(cpu, &work))
+ return -ENOENT;
wait_for_completion(&done.completion);
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/* This controls the threads on each CPU. */
@@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
struct cpu_stop_work work1, work2;
struct multi_stop_data msdata;
- preempt_disable();
msdata = (struct multi_stop_data){
.fn = fn,
.data = arg,
@@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
if (cpu1 > cpu2)
swap(cpu1, cpu2);
- if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
- preempt_enable();
+ if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2))
return -ENOENT;
- }
-
- preempt_enable();
wait_for_completion(&done.completion);
-
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/**
@@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
*
* CONTEXT:
* Don't care.
+ *
+ * RETURNS:
+ * true if cpu_stop_work was queued successfully and @fn will be called,
+ * false otherwise.
*/
-void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
- cpu_stop_queue_work(cpu, work_buf);
+ return cpu_stop_queue_work(cpu, work_buf);
}
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
-static void queue_stop_cpus_work(const struct cpumask *cpumask,
+static bool queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
struct cpu_stop_done *done)
{
struct cpu_stop_work *work;
unsigned int cpu;
+ bool queued = false;
/*
* Disable preemption while queueing to avoid getting
@@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
work->fn = fn;
work->arg = arg;
work->done = done;
- cpu_stop_queue_work(cpu, work);
+ if (cpu_stop_queue_work(cpu, work))
+ queued = true;
}
lg_global_unlock(&stop_cpus_lock);
+
+ return queued;
}
static int __stop_cpus(const struct cpumask *cpumask,
@@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask,
struct cpu_stop_done done;
cpu_stop_init_done(&done, cpumask_weight(cpumask));
- queue_stop_cpus_work(cpumask, fn, arg, &done);
+ if (!queue_stop_cpus_work(cpumask, fn, arg, &done))
+ return -ENOENT;
wait_for_completion(&done.completion);
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/**
@@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct cpu_stop_work *work;
- int ret;
repeat:
work = NULL;
@@ -448,23 +450,19 @@ repeat:
cpu_stop_fn_t fn = work->fn;
void *arg = work->arg;
struct cpu_stop_done *done = work->done;
- char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
-
- /* cpu stop callbacks are not allowed to sleep */
- preempt_disable();
+ int ret;
+ /* cpu stop callbacks must not sleep, make in_atomic() == T */
+ preempt_count_inc();
ret = fn(arg);
- if (ret)
- done->ret = ret;
-
- /* restore preemption and check it's still balanced */
- preempt_enable();
+ if (done) {
+ if (ret)
+ done->ret = ret;
+ cpu_stop_signal_done(done);
+ }
+ preempt_count_dec();
WARN_ONCE(preempt_count(),
- "cpu_stop: %s(%p) leaked preempt count\n",
- kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
- ksym_buf), arg);
-
- cpu_stop_signal_done(done, true);
+ "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg);
goto repeat;
}
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7c7ec4515983..515edf3eb62b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -694,11 +694,11 @@ out:
return tick;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
- update_cpu_load_nohz();
+ update_cpu_load_nohz(active);
calc_load_exit_idle();
touch_softlockup_watchdog();
@@ -725,7 +725,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
if (can_stop_full_tick())
tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
else if (ts->tick_stopped)
- tick_nohz_restart_sched_tick(ts, ktime_get());
+ tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
#endif
}
@@ -916,7 +916,7 @@ void tick_nohz_idle_exit(void)
tick_nohz_stop_idle(ts, now);
if (ts->tick_stopped) {
- tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_restart_sched_tick(ts, now, 0);
tick_nohz_account_idle_ticks(ts);
}