aboutsummaryrefslogtreecommitdiff
path: root/kernel/rcu
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcu')
-rw-r--r--kernel/rcu/Kconfig18
-rw-r--r--kernel/rcu/rcu.h16
-rw-r--r--kernel/rcu/rcuscale.c282
-rw-r--r--kernel/rcu/rcutorture.c7
-rw-r--r--kernel/rcu/refscale.c37
-rw-r--r--kernel/rcu/tasks.h148
-rw-r--r--kernel/rcu/tree.c147
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_nocb.h56
-rw-r--r--kernel/rcu/tree_plugin.h4
-rw-r--r--kernel/rcu/tree_stall.h2
11 files changed, 526 insertions, 193 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 9071182b1284..bdd7eadb33d8 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -314,4 +314,22 @@ config RCU_LAZY
To save power, batch RCU callbacks and flush after delay, memory
pressure, or callback list growing too big.
+config RCU_DOUBLE_CHECK_CB_TIME
+ bool "RCU callback-batch backup time check"
+ depends on RCU_EXPERT
+ default n
+ help
+ Use this option to provide more precise enforcement of the
+ rcutree.rcu_resched_ns module parameter in situations where
+ a single RCU callback might run for hundreds of microseconds,
+ thus defeating the 32-callback batching used to amortize the
+ cost of the fine-grained but expensive local_clock() function.
+
+ This option rounds rcutree.rcu_resched_ns up to the next
+ jiffy, and overrides the 32-callback batching if this limit
+ is exceeded.
+
+ Say Y here if you need tighter callback-limit enforcement.
+ Say N here if you are unsure.
+
endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 4a1b9622598b..98e13be411af 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -493,7 +493,6 @@ static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
static inline void rcu_async_hurry(void) { }
static inline void rcu_async_relax(void) { }
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -508,9 +507,16 @@ void show_rcu_tasks_gp_kthreads(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
-void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TASKS_RCU
+struct task_struct *get_rcu_tasks_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
@@ -642,4 +648,10 @@ void show_rcu_tasks_trace_gp_kthread(void);
static inline void show_rcu_tasks_trace_gp_kthread(void) {}
#endif
+#ifdef CONFIG_TINY_RCU
+static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
+#else
+bool rcu_cpu_beenfullyonline(int cpu);
+#endif
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index e82ec9f9a5d8..ffdb30495e3c 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -84,15 +84,17 @@ MODULE_AUTHOR("Paul E. McKenney <[email protected]>");
#endif
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
-torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, minruntime, 0, "Minimum run time (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
"Shutdown at end of scalability tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
@@ -139,6 +141,7 @@ struct rcu_scale_ops {
void (*gp_barrier)(void);
void (*sync)(void);
void (*exp_sync)(void);
+ struct task_struct *(*rso_gp_kthread)(void);
const char *name;
};
@@ -295,6 +298,7 @@ static struct rcu_scale_ops tasks_ops = {
.gp_barrier = rcu_barrier_tasks,
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
+ .rso_gp_kthread = get_rcu_tasks_gp_kthread,
.name = "tasks"
};
@@ -306,6 +310,44 @@ static struct rcu_scale_ops tasks_ops = {
#endif // #else // #ifdef CONFIG_TASKS_RCU
+#ifdef CONFIG_TASKS_RUDE_RCU
+
+/*
+ * Definitions for RCU-tasks-rude scalability testing.
+ */
+
+static int tasks_rude_scale_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_rude_scale_read_unlock(int idx)
+{
+}
+
+static struct rcu_scale_ops tasks_rude_ops = {
+ .ptype = RCU_TASKS_RUDE_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_rude_scale_read_lock,
+ .readunlock = tasks_rude_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks_rude,
+ .gp_barrier = rcu_barrier_tasks_rude,
+ .sync = synchronize_rcu_tasks_rude,
+ .exp_sync = synchronize_rcu_tasks_rude,
+ .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread,
+ .name = "tasks-rude"
+};
+
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU
+
#ifdef CONFIG_TASKS_TRACE_RCU
/*
@@ -334,6 +376,7 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.gp_barrier = rcu_barrier_tasks_trace,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
+ .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
.name = "tasks-tracing"
};
@@ -410,10 +453,12 @@ rcu_scale_writer(void *arg)
{
int i = 0;
int i_max;
+ unsigned long jdone;
long me = (long)arg;
struct rcu_head *rhp = NULL;
bool started = false, done = false, alldone = false;
u64 t;
+ DEFINE_TORTURE_RANDOM(tr);
u64 *wdp;
u64 *wdpp = writer_durations[me];
@@ -424,7 +469,7 @@ rcu_scale_writer(void *arg)
sched_set_fifo_low(current);
if (holdoff)
- schedule_timeout_uninterruptible(holdoff * HZ);
+ schedule_timeout_idle(holdoff * HZ);
/*
* Wait until rcu_end_inkernel_boot() is called for normal GP tests
@@ -445,9 +490,12 @@ rcu_scale_writer(void *arg)
}
}
+ jdone = jiffies + minruntime * HZ;
do {
if (writer_holdoff)
udelay(writer_holdoff);
+ if (writer_holdoff_jiffies)
+ schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
if (gp_async) {
@@ -475,7 +523,7 @@ retry:
if (!started &&
atomic_read(&n_rcu_scale_writer_started) >= nrealwriters)
started = true;
- if (!done && i >= MIN_MEAS) {
+ if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
done = true;
sched_set_normal(current, 0);
pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
@@ -518,91 +566,8 @@ static void
rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
- scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
-}
-
-static void
-rcu_scale_cleanup(void)
-{
- int i;
- int j;
- int ngps = 0;
- u64 *wdp;
- u64 *wdpp;
-
- /*
- * Would like warning at start, but everything is expedited
- * during the mid-boot phase, so have to wait till the end.
- */
- if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
- SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
- if (rcu_gp_is_normal() && gp_exp)
- SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
- if (gp_exp && gp_async)
- SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
-
- if (torture_cleanup_begin())
- return;
- if (!cur_ops) {
- torture_cleanup_end();
- return;
- }
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++)
- torture_stop_kthread(rcu_scale_reader,
- reader_tasks[i]);
- kfree(reader_tasks);
- }
-
- if (writer_tasks) {
- for (i = 0; i < nrealwriters; i++) {
- torture_stop_kthread(rcu_scale_writer,
- writer_tasks[i]);
- if (!writer_n_durations)
- continue;
- j = writer_n_durations[i];
- pr_alert("%s%s writer %d gps: %d\n",
- scale_type, SCALE_FLAG, i, j);
- ngps += j;
- }
- pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
- scale_type, SCALE_FLAG,
- t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
- t_rcu_scale_writer_finished -
- t_rcu_scale_writer_started,
- ngps,
- rcuscale_seq_diff(b_rcu_gp_test_finished,
- b_rcu_gp_test_started));
- for (i = 0; i < nrealwriters; i++) {
- if (!writer_durations)
- break;
- if (!writer_n_durations)
- continue;
- wdpp = writer_durations[i];
- if (!wdpp)
- continue;
- for (j = 0; j < writer_n_durations[i]; j++) {
- wdp = &wdpp[j];
- pr_alert("%s%s %4d writer-duration: %5d %llu\n",
- scale_type, SCALE_FLAG,
- i, j, *wdp);
- if (j % 100 == 0)
- schedule_timeout_uninterruptible(1);
- }
- kfree(writer_durations[i]);
- }
- kfree(writer_tasks);
- kfree(writer_durations);
- kfree(writer_n_durations);
- }
-
- /* Do torture-type-specific cleanup operations. */
- if (cur_ops->cleanup != NULL)
- cur_ops->cleanup();
-
- torture_cleanup_end();
+ "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n",
+ scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown);
}
/*
@@ -625,20 +590,6 @@ static int compute_real(int n)
}
/*
- * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
- * down system.
- */
-static int
-rcu_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
- smp_mb(); /* Wake before output. */
- rcu_scale_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
-/*
* kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number
* of iterations and measure total time and number of GP for all iterations to complete.
*/
@@ -653,6 +604,8 @@ static struct task_struct **kfree_reader_tasks;
static int kfree_nrealthreads;
static atomic_t n_kfree_scale_thread_started;
static atomic_t n_kfree_scale_thread_ended;
+static struct task_struct *kthread_tp;
+static u64 kthread_stime;
struct kfree_obj {
char kfree_obj[8];
@@ -798,6 +751,10 @@ kfree_scale_init(void)
unsigned long jif_start;
unsigned long orig_jif;
+ pr_alert("%s" SCALE_FLAG
+ "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n",
+ scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single);
+
// Also, do a quick self-test to ensure laziness is as much as
// expected.
if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) {
@@ -874,13 +831,127 @@ unwind:
return firsterr;
}
+static void
+rcu_scale_cleanup(void)
+{
+ int i;
+ int j;
+ int ngps = 0;
+ u64 *wdp;
+ u64 *wdpp;
+
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
+ // If built-in, just report all of the GP kthread's CPU time.
+ if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread)
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp) {
+ u32 ns;
+ u64 us;
+
+ kthread_stime = kthread_tp->stime - kthread_stime;
+ us = div_u64_rem(kthread_stime, 1000, &ns);
+ pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns);
+ show_rcu_gp_kthreads();
+ }
+ if (kfree_rcu_test) {
+ kfree_scale_cleanup();
+ return;
+ }
+
+ if (torture_cleanup_begin())
+ return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_scale_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters; i++) {
+ torture_stop_kthread(rcu_scale_writer,
+ writer_tasks[i]);
+ if (!writer_n_durations)
+ continue;
+ j = writer_n_durations[i];
+ pr_alert("%s%s writer %d gps: %d\n",
+ scale_type, SCALE_FLAG, i, j);
+ ngps += j;
+ }
+ pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+ scale_type, SCALE_FLAG,
+ t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
+ t_rcu_scale_writer_finished -
+ t_rcu_scale_writer_started,
+ ngps,
+ rcuscale_seq_diff(b_rcu_gp_test_finished,
+ b_rcu_gp_test_started));
+ for (i = 0; i < nrealwriters; i++) {
+ if (!writer_durations)
+ break;
+ if (!writer_n_durations)
+ continue;
+ wdpp = writer_durations[i];
+ if (!wdpp)
+ continue;
+ for (j = 0; j < writer_n_durations[i]; j++) {
+ wdp = &wdpp[j];
+ pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+ scale_type, SCALE_FLAG,
+ i, j, *wdp);
+ if (j % 100 == 0)
+ schedule_timeout_uninterruptible(1);
+ }
+ kfree(writer_durations[i]);
+ }
+ kfree(writer_tasks);
+ kfree(writer_durations);
+ kfree(writer_n_durations);
+ }
+
+ /* Do torture-type-specific cleanup operations. */
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+/*
+ * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
+ * down system.
+ */
+static int
+rcu_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ smp_mb(); /* Wake before output. */
+ rcu_scale_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
static int __init
rcu_scale_init(void)
{
long i;
int firsterr = 0;
static struct rcu_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS
+ &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
};
if (!torture_init_begin(scale_type, verbose))
@@ -905,6 +976,11 @@ rcu_scale_init(void)
if (cur_ops->init)
cur_ops->init();
+ if (cur_ops->rso_gp_kthread) {
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp)
+ kthread_stime = kthread_tp->stime;
+ }
if (kfree_rcu_test)
return kfree_scale_init();
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 147551c23baf..ade42d6a9d9b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1581,6 +1581,7 @@ rcu_torture_writer(void *arg)
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
tracing_off();
+ show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1876,7 +1877,7 @@ static int
rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
{
int mask = rcutorture_extend_mask_max();
- unsigned long randmask1 = torture_random(trsp) >> 8;
+ unsigned long randmask1 = torture_random(trsp);
unsigned long randmask2 = randmask1 >> 3;
unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
@@ -1935,7 +1936,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
if (!((mask - 1) & mask))
return rtrsp; /* Current RCU reader not extendable. */
/* Bias towards larger numbers of loops. */
- i = (torture_random(trsp) >> 3);
+ i = torture_random(trsp);
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
@@ -2136,7 +2137,7 @@ static int rcu_nocb_toggle(void *arg)
toggle_fuzz = NSEC_PER_USEC;
do {
r = torture_random(&rand);
- cpu = (r >> 4) % (maxcpu + 1);
+ cpu = (r >> 1) % (maxcpu + 1);
if (r & 0x1) {
rcu_nocb_cpu_offload(cpu);
atomic_long_inc(&n_nocb_offload);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 1970ce5f22d4..91a0fd0d4d9a 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -528,6 +528,38 @@ static struct ref_scale_ops clock_ops = {
.name = "clock"
};
+static void ref_jiffies_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += jiffies;
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += jiffies;
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops jiffies_ops = {
+ .readsection = ref_jiffies_section,
+ .delaysection = ref_jiffies_delay_section,
+ .name = "jiffies"
+};
+
////////////////////////////////////////////////////////////////////////
//
// Methods leveraging SLAB_TYPESAFE_BY_RCU.
@@ -1047,7 +1079,7 @@ ref_scale_init(void)
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
- &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
@@ -1107,12 +1139,11 @@ ref_scale_init(void)
VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
for (i = 0; i < nreaders; i++) {
+ init_waitqueue_head(&reader_tasks[i].wq);
firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
reader_tasks[i].task);
if (torture_init_error(firsterr))
goto unwind;
-
- init_waitqueue_head(&(reader_tasks[i].wq));
}
// Main Task
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 5f4fc8184dd0..8d65f7d576a3 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -25,6 +25,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @cblist: Callback list.
* @lock: Lock protecting per-CPU callback list.
* @rtp_jiffies: Jiffies counter value for statistics.
+ * @lazy_timer: Timer to unlazify callbacks.
+ * @urgent_gp: Number of additional non-lazy grace periods.
* @rtp_n_lock_retries: Rough lock-contention statistic.
* @rtp_work: Work queue for invoking callbacks.
* @rtp_irq_work: IRQ work queue for deferred wakeups.
@@ -38,6 +40,8 @@ struct rcu_tasks_percpu {
raw_spinlock_t __private lock;
unsigned long rtp_jiffies;
unsigned long rtp_n_lock_retries;
+ struct timer_list lazy_timer;
+ unsigned int urgent_gp;
struct work_struct rtp_work;
struct irq_work rtp_irq_work;
struct rcu_head barrier_q_head;
@@ -51,7 +55,6 @@ struct rcu_tasks_percpu {
* @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
* @cbs_gbl_lock: Lock protecting callback list.
* @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
- * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
* @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
@@ -61,6 +64,8 @@ struct rcu_tasks_percpu {
* @tasks_gp_seq: Number of grace periods completed since boot.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
+ * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy.
* @pregp_func: This flavor's pre-grace-period function (optional).
* @pertask_func: This flavor's per-task scan function (optional).
* @postscan_func: This flavor's post-task scan function (optional).
@@ -92,6 +97,7 @@ struct rcu_tasks {
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
+ unsigned long lazy_jiffies;
rcu_tasks_gp_func_t gp_func;
pregp_func_t pregp_func;
pertask_func_t pertask_func;
@@ -127,6 +133,7 @@ static struct rcu_tasks rt_name = \
.gp_func = gp, \
.call_func = call, \
.rtpcpu = &rt_name ## __percpu, \
+ .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
.name = n, \
.percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
.percpu_enqueue_lim = 1, \
@@ -139,9 +146,7 @@ static struct rcu_tasks rt_name = \
#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
-#endif
-#ifdef CONFIG_TASKS_RCU
/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
@@ -171,6 +176,8 @@ static int rcu_task_contend_lim __read_mostly = 100;
module_param(rcu_task_contend_lim, int, 0444);
static int rcu_task_collapse_lim __read_mostly = 10;
module_param(rcu_task_collapse_lim, int, 0444);
+static int rcu_task_lazy_lim __read_mostly = 32;
+module_param(rcu_task_lazy_lim, int, 0444);
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
@@ -229,7 +236,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
#endif /* #ifndef CONFIG_TINY_RCU */
// Initialize per-CPU callback lists for the specified flavor of
-// Tasks RCU.
+// Tasks RCU. Do not enqueue callbacks before this function is invoked.
static void cblist_init_generic(struct rcu_tasks *rtp)
{
int cpu;
@@ -237,11 +244,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
int lim;
int shift;
- raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rcu_task_enqueue_lim < 0) {
rcu_task_enqueue_lim = 1;
rcu_task_cb_adjust = true;
- pr_info("%s: Setting adjustable number of callback queues.\n", __func__);
} else if (rcu_task_enqueue_lim == 0) {
rcu_task_enqueue_lim = 1;
}
@@ -261,18 +266,46 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
WARN_ON_ONCE(!rtpcp);
if (cpu)
raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
- raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ local_irq_save(flags); // serialize initialization
if (rcu_segcblist_empty(&rtpcp->cblist))
rcu_segcblist_init(&rtpcp->cblist);
+ local_irq_restore(flags);
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
- raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
}
- raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
- pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim));
+
+ pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
+ data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
+}
+
+// Compute wakeup time for lazy callback timer.
+static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp)
+{
+ return jiffies + rtp->lazy_jiffies;
+}
+
+// Timer handler that unlazifies lazy callbacks.
+static void call_rcu_tasks_generic_timer(struct timer_list *tlp)
+{
+ unsigned long flags;
+ bool needwake = false;
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer);
+
+ rtp = rtpcp->rtpp;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) {
+ if (!rtpcp->urgent_gp)
+ rtpcp->urgent_gp = 1;
+ needwake = true;
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (needwake)
+ rcuwait_wake_up(&rtp->cbs_wait);
}
// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
@@ -291,6 +324,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
{
int chosen_cpu;
unsigned long flags;
+ bool havekthread = smp_load_acquire(&rtp->kthread_ptr);
int ideal_cpu;
unsigned long j;
bool needadjust = false;
@@ -315,12 +349,19 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
needadjust = true; // Defer adjustment to avoid deadlock.
}
- if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) {
- raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
- cblist_init_generic(rtp);
- raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ // Queuing callbacks before initialization not yet supported.
+ if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist)))
+ rcu_segcblist_init(&rtpcp->cblist);
+ needwake = (func == wakeme_after_rcu) ||
+ (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim);
+ if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) {
+ if (rtp->lazy_jiffies)
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ else
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
}
- needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ if (needwake)
+ rtpcp->urgent_gp = 3;
rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
if (unlikely(needadjust)) {
@@ -414,9 +455,14 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
}
rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
(void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
- if (rcu_segcblist_pend_cbs(&rtpcp->cblist))
+ if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) {
+ if (rtp->lazy_jiffies)
+ rtpcp->urgent_gp--;
needgpcb |= 0x3;
- if (!rcu_segcblist_empty(&rtpcp->cblist))
+ } else if (rcu_segcblist_empty(&rtpcp->cblist)) {
+ rtpcp->urgent_gp = 0;
+ }
+ if (rcu_segcblist_ready_cbs(&rtpcp->cblist))
needgpcb |= 0x1;
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
@@ -463,6 +509,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
{
int cpu;
int cpunext;
+ int cpuwq;
unsigned long flags;
int len;
struct rcu_head *rhp;
@@ -473,11 +520,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
cpunext = cpu * 2 + 1;
if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
- queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
cpunext++;
if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
- queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
}
}
@@ -521,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
if (unlikely(midboot)) {
needgpcb = 0x2;
} else {
+ mutex_unlock(&rtp->tasks_gp_mutex);
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
rcuwait_wait_event(&rtp->cbs_wait,
(needgpcb = rcu_tasks_need_gpcb(rtp)),
TASK_IDLE);
+ mutex_lock(&rtp->tasks_gp_mutex);
}
if (needgpcb & 0x2) {
@@ -545,11 +596,19 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
// RCU-tasks kthread that detects grace periods and invokes callbacks.
static int __noreturn rcu_tasks_kthread(void *arg)
{
+ int cpu;
struct rcu_tasks *rtp = arg;
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0);
+ rtpcp->urgent_gp = 1;
+ }
+
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
housekeeping_affine(current, HK_TYPE_RCU);
- WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
+ smp_store_release(&rtp->kthread_ptr, current); // Let GPs start!
/*
* Each pass through the following loop makes one check for
@@ -631,16 +690,22 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
int cpu;
bool havecbs = false;
+ bool haveurgent = false;
+ bool haveurgentcbs = false;
for_each_possible_cpu(cpu) {
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
- if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) {
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)))
havecbs = true;
+ if (data_race(rtpcp->urgent_gp))
+ haveurgent = true;
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp))
+ haveurgentcbs = true;
+ if (havecbs && haveurgent && haveurgentcbs)
break;
- }
}
- pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
+ pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
jiffies - data_race(rtp->gp_jiffies),
@@ -648,6 +713,9 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
".C"[havecbs],
+ ".u"[haveurgent],
+ ".U"[haveurgentcbs],
+ rtp->lazy_jiffies,
s);
}
#endif // #ifndef CONFIG_TINY_RCU
@@ -1016,11 +1084,16 @@ void rcu_barrier_tasks(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+int rcu_tasks_lazy_ms = -1;
+module_param(rcu_tasks_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_kthread(void)
{
cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
+ if (rcu_tasks_lazy_ms >= 0)
+ rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms);
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
rcu_tasks.pertask_func = rcu_tasks_pertask;
rcu_tasks.postscan_func = rcu_tasks_postscan;
@@ -1038,6 +1111,12 @@ void show_rcu_tasks_classic_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+struct task_struct *get_rcu_tasks_gp_kthread(void)
+{
+ return rcu_tasks.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+
/*
* Contribute to protect against tasklist scan blind spot while the
* task is exiting and may be removed from the tasklist. See
@@ -1169,10 +1248,15 @@ void rcu_barrier_tasks_rude(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
+int rcu_tasks_rude_lazy_ms = -1;
+module_param(rcu_tasks_rude_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_rude_kthread(void)
{
cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
+ if (rcu_tasks_rude_lazy_ms >= 0)
+ rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
}
@@ -1184,6 +1268,13 @@ void show_rcu_tasks_rude_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
+{
+ return rcu_tasks_rude.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
+
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
@@ -1789,6 +1880,9 @@ void rcu_barrier_tasks_trace(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
+int rcu_tasks_trace_lazy_ms = -1;
+module_param(rcu_tasks_trace_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_trace_kthread(void)
{
cblist_init_generic(&rcu_tasks_trace);
@@ -1803,6 +1897,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
if (rcu_tasks_trace.init_fract <= 0)
rcu_tasks_trace.init_fract = 1;
}
+ if (rcu_tasks_trace_lazy_ms >= 0)
+ rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms);
rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
@@ -1826,6 +1922,12 @@ void show_rcu_tasks_trace_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
+{
+ return rcu_tasks_trace.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
+
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f52ff7241041..cb1caefa8bd0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -632,7 +632,7 @@ void __rcu_irq_enter_check_tick(void)
// prevents self-deadlock. So we can safely recheck under the lock.
// Note that the nohz_full state currently cannot change.
raw_spin_lock_rcu_node(rdp->mynode);
- if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+ if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
// A nohz_full CPU is in the kernel and RCU needs a
// quiescent state. Turn on the tick!
WRITE_ONCE(rdp->rcu_forced_tick, true);
@@ -677,12 +677,16 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
}
/**
- * rcu_is_watching - see if RCU thinks that the current CPU is not idle
+ * rcu_is_watching - RCU read-side critical sections permitted on current CPU?
*
- * Return true if RCU is watching the running CPU, which means that this
- * CPU can safely enter RCU read-side critical sections. In other words,
- * if the current CPU is not in its idle loop or is in an interrupt or
- * NMI handler, return true.
+ * Return @true if RCU is watching the running CPU and @false otherwise.
+ * An @true return means that this CPU can safely enter RCU read-side
+ * critical sections.
+ *
+ * Although calls to rcu_is_watching() from most parts of the kernel
+ * will return @true, there are important exceptions. For example, if the
+ * current CPU is deep within its idle loop, in kernel entry/exit code,
+ * or offline, rcu_is_watching() will return @false.
*
* Make notrace because it can be called by the internal functions of
* ftrace, and making this notrace removes unnecessary recursion calls.
@@ -2046,19 +2050,35 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
rcu_report_qs_rdp(rdp);
}
+/* Return true if callback-invocation time limit exceeded. */
+static bool rcu_do_batch_check_time(long count, long tlimit,
+ bool jlimit_check, unsigned long jlimit)
+{
+ // Invoke local_clock() only once per 32 consecutive callbacks.
+ return unlikely(tlimit) &&
+ (!likely(count & 31) ||
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) &&
+ jlimit_check && time_after(jiffies, jlimit))) &&
+ local_clock() >= tlimit;
+}
+
/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
+ long bl;
+ long count = 0;
int div;
bool __maybe_unused empty;
unsigned long flags;
- struct rcu_head *rhp;
+ unsigned long jlimit;
+ bool jlimit_check = false;
+ long pending;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
- long bl, count = 0;
- long pending, tlimit = 0;
+ struct rcu_head *rhp;
+ long tlimit = 0;
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@ -2082,11 +2102,15 @@ static void rcu_do_batch(struct rcu_data *rdp)
div = READ_ONCE(rcu_divisor);
div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
bl = max(rdp->blimit, pending >> div);
- if (in_serving_softirq() && unlikely(bl > 100)) {
+ if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) &&
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) {
+ const long npj = NSEC_PER_SEC / HZ;
long rrn = READ_ONCE(rcu_resched_ns);
rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
tlimit = local_clock() + rrn;
+ jlimit = jiffies + (rrn + npj + 1) / npj;
+ jlimit_check = true;
}
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_cbs(&rdp->cblist), bl);
@@ -2126,21 +2150,23 @@ static void rcu_do_batch(struct rcu_data *rdp)
* Make sure we don't spend too much time here and deprive other
* softirq vectors of CPU cycles.
*/
- if (unlikely(tlimit)) {
- /* only call local_clock() every 32 callbacks */
- if (likely((count & 31) || local_clock() < tlimit))
- continue;
- /* Exceeded the time limit, so leave. */
+ if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit))
break;
- }
} else {
- // In rcuoc context, so no worries about depriving
- // other softirq vectors of CPU cycles.
+ // In rcuc/rcuoc context, so no worries about
+ // depriving other softirq vectors of CPU cycles.
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
lockdep_assert_irqs_enabled();
local_bh_disable();
+ // But rcuc kthreads can delay quiescent-state
+ // reporting, so check time limits for them.
+ if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING &&
+ rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) {
+ rdp->rcu_cpu_has_work = 1;
+ break;
+ }
}
}
@@ -2459,12 +2485,12 @@ static void rcu_cpu_kthread(unsigned int cpu)
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
work = *workp;
- *workp = 0;
+ WRITE_ONCE(*workp, 0);
local_irq_enable();
if (work)
rcu_core();
local_bh_enable();
- if (*workp == 0) {
+ if (!READ_ONCE(*workp)) {
trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
*statusp = RCU_KTHREAD_WAITING;
return;
@@ -2756,7 +2782,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
struct kvfree_rcu_bulk_data {
struct list_head list;
- unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap;
unsigned long nr_records;
void *records[];
};
@@ -2773,6 +2799,7 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
* @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
@@ -2780,6 +2807,7 @@ struct kvfree_rcu_bulk_data {
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
+ struct rcu_gp_oldstate head_free_gp_snap;
struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
@@ -2900,6 +2928,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
struct llist_node *page_list, *pos, *n;
int freed = 0;
+ if (!rcu_min_cached_objs)
+ return 0;
+
raw_spin_lock_irqsave(&krcp->lock, flags);
page_list = llist_del_all(&krcp->bkvcache);
WRITE_ONCE(krcp->nr_bkv_objs, 0);
@@ -2920,24 +2951,25 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
unsigned long flags;
int i;
- debug_rcu_bhead_unqueue(bnode);
-
- rcu_lock_acquire(&rcu_callback_map);
- if (idx == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bnode->nr_records,
- bnode->records);
-
- kfree_bulk(bnode->nr_records, bnode->records);
- } else { // vmalloc() / vfree().
- for (i = 0; i < bnode->nr_records; i++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name, bnode->records[i], 0);
-
- vfree(bnode->records[i]);
+ if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+ debug_rcu_bhead_unqueue(bnode);
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
}
+ rcu_lock_release(&rcu_callback_map);
}
- rcu_lock_release(&rcu_callback_map);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (put_cached_bnode(krcp, bnode))
@@ -2984,6 +3016,7 @@ static void kfree_rcu_work(struct work_struct *work)
struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
+ struct rcu_gp_oldstate head_gp_snap;
int i;
krwp = container_of(to_rcu_work(work),
@@ -2998,6 +3031,7 @@ static void kfree_rcu_work(struct work_struct *work)
// Channel 3.
head = krwp->head_free;
krwp->head_free = NULL;
+ head_gp_snap = krwp->head_free_gp_snap;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
// Handle the first two channels.
@@ -3014,7 +3048,8 @@ static void kfree_rcu_work(struct work_struct *work)
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
- kvfree_rcu_list(head);
+ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+ kvfree_rcu_list(head);
}
static bool
@@ -3081,7 +3116,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
INIT_LIST_HEAD(&bulk_ready[i]);
list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
- if (!poll_state_synchronize_rcu(bnode->gp_snap))
+ if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
break;
atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
@@ -3146,6 +3181,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
+ get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
atomic_set(&krcp->head_count, 0);
WRITE_ONCE(krcp->head, NULL);
}
@@ -3194,7 +3230,7 @@ static void fill_page_cache_func(struct work_struct *work)
nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
1 : rcu_min_cached_objs;
- for (i = 0; i < nr_pages; i++) {
+ for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -3218,6 +3254,10 @@ static void fill_page_cache_func(struct work_struct *work)
static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
+ // If cache disabled, bail out.
+ if (!rcu_min_cached_objs)
+ return;
+
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
if (atomic_read(&krcp->backoff_page_cache_fill)) {
@@ -3272,7 +3312,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
// scenarios.
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- *krcp = krc_this_cpu_lock(flags);
+ raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
}
if (!bnode)
@@ -3285,7 +3325,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
// Finally insert and update the GP for this page.
bnode->records[bnode->nr_records++] = ptr;
- bnode->gp_snap = get_state_synchronize_rcu();
+ get_state_synchronize_rcu_full(&bnode->gp_snap);
atomic_inc(&(*krcp)->bulk_count[idx]);
return true;
@@ -4283,7 +4323,6 @@ int rcutree_prepare_cpu(unsigned int cpu)
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rdp->beenonline = true; /* We have now been online. */
rdp->gp_seq = READ_ONCE(rnp->gp_seq);
rdp->gp_seq_needed = rdp->gp_seq;
rdp->cpu_no_qs.b.norm = true;
@@ -4311,6 +4350,16 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
}
/*
+ * Has the specified (known valid) CPU ever been fully online?
+ */
+bool rcu_cpu_beenfullyonline(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return smp_load_acquire(&rdp->beenonline);
+}
+
+/*
* Near the end of the CPU-online process. Pretty much all services
* enabled, and the CPU is now very much alive.
*/
@@ -4368,15 +4417,16 @@ int rcutree_offline_cpu(unsigned int cpu)
* Note that this function is special in that it is invoked directly
* from the incoming CPU rather than from the cpuhp_step mechanism.
* This is because this function must be invoked at a precise location.
+ * This incoming CPU must not have enabled interrupts yet.
*/
void rcu_cpu_starting(unsigned int cpu)
{
- unsigned long flags;
unsigned long mask;
struct rcu_data *rdp;
struct rcu_node *rnp;
bool newcpu;
+ lockdep_assert_irqs_disabled();
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu_started)
return;
@@ -4384,7 +4434,6 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
- local_irq_save(flags);
arch_spin_lock(&rcu_state.ofl_lock);
rcu_dynticks_eqs_online();
raw_spin_lock(&rcu_state.barrier_lock);
@@ -4403,17 +4452,17 @@ void rcu_cpu_starting(unsigned int cpu)
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
/* rcu_report_qs_rnp() *really* wants some flags to restore */
- unsigned long flags2;
+ unsigned long flags;
- local_irq_save(flags2);
+ local_irq_save(flags);
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
- rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2);
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
raw_spin_unlock_rcu_node(rnp);
}
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ smp_store_release(&rdp->beenonline, true);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 3b7abb58157d..8239b39d945b 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -643,7 +643,7 @@ static void synchronize_rcu_expedited_wait(void)
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
"N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
- "D."[!!(rdp->cpu_no_qs.b.exp)]);
+ "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index f2280616f9d5..5598212d1f27 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -77,9 +77,9 @@ __setup("rcu_nocbs", rcu_nocb_setup);
static int __init parse_rcu_nocb_poll(char *arg)
{
rcu_nocb_poll = true;
- return 0;
+ return 1;
}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+__setup("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
* Don't bother bypassing ->cblist if the call_rcu() rate is low.
@@ -1319,13 +1319,22 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
int cpu;
unsigned long count = 0;
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+
+ /* Protect rcu_nocb_mask against concurrent (de-)offloading. */
+ if (!mutex_trylock(&rcu_state.barrier_mutex))
+ return 0;
+
/* Snapshot count of all CPUs */
- for_each_possible_cpu(cpu) {
+ for_each_cpu(cpu, rcu_nocb_mask) {
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
count += READ_ONCE(rdp->lazy_len);
}
+ mutex_unlock(&rcu_state.barrier_mutex);
+
return count ? count : SHRINK_EMPTY;
}
@@ -1336,15 +1345,45 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long flags;
unsigned long count = 0;
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+ /*
+ * Protect against concurrent (de-)offloading. Otherwise nocb locking
+ * may be ignored or imbalanced.
+ */
+ if (!mutex_trylock(&rcu_state.barrier_mutex)) {
+ /*
+ * But really don't insist if barrier_mutex is contended since we
+ * can't guarantee that it will never engage in a dependency
+ * chain involving memory allocation. The lock is seldom contended
+ * anyway.
+ */
+ return 0;
+ }
+
/* Snapshot count of all CPUs */
- for_each_possible_cpu(cpu) {
+ for_each_cpu(cpu, rcu_nocb_mask) {
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int _count = READ_ONCE(rdp->lazy_len);
+ int _count;
+
+ if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)))
+ continue;
- if (_count == 0)
+ if (!READ_ONCE(rdp->lazy_len))
continue;
+
rcu_nocb_lock_irqsave(rdp, flags);
- WRITE_ONCE(rdp->lazy_len, 0);
+ /*
+ * Recheck under the nocb lock. Since we are not holding the bypass
+ * lock we may still race with increments from the enqueuer but still
+ * we know for sure if there is at least one lazy callback.
+ */
+ _count = READ_ONCE(rdp->lazy_len);
+ if (!_count) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue;
+ }
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp(rdp, false);
sc->nr_to_scan -= _count;
@@ -1352,6 +1391,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
if (sc->nr_to_scan <= 0)
break;
}
+
+ mutex_unlock(&rcu_state.barrier_mutex);
+
return count ? count : SHRINK_STOP;
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7b0fe741a088..41021080ad25 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -257,6 +257,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* GP should not be able to end until we report, so there should be
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
+ *
+ * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change.
*/
if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
@@ -941,7 +943,7 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- if (rdp->cpu_no_qs.b.exp)
+ if (READ_ONCE(rdp->cpu_no_qs.b.exp))
rcu_report_exp_rdp(rdp);
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b10b8349bb2a..6f06dc12904a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -1035,7 +1035,7 @@ static bool sysrq_rcu;
module_param(sysrq_rcu, bool, 0444);
/* Dump grace-period-request information due to commandeered sysrq. */
-static void sysrq_show_rcu(int key)
+static void sysrq_show_rcu(u8 key)
{
show_rcu_gp_kthreads();
}