Diffstat (limited to 'kernel/rcu')
-rw-r--r--  kernel/rcu/Kconfig           28
-rw-r--r--  kernel/rcu/rcu_segcblist.h    1
-rw-r--r--  kernel/rcu/rcuscale.c         8
-rw-r--r--  kernel/rcu/rcutorture.c      92
-rw-r--r--  kernel/rcu/refscale.c        56
-rw-r--r--  kernel/rcu/srcutiny.c         2
-rw-r--r--  kernel/rcu/srcutree.c       133
-rw-r--r--  kernel/rcu/tasks.h           20
-rw-r--r--  kernel/rcu/tree.c            24
-rw-r--r--  kernel/rcu/tree_nocb.h       13
-rw-r--r--  kernel/rcu/tree_plugin.h     22
-rw-r--r--  kernel/rcu/tree_stall.h      57
12 files changed, 283 insertions, 173 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 3e079de0f5b4..b9b6bc55185d 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -249,16 +249,24 @@ config RCU_NOCB_CPU
workloads will incur significant increases in context-switch
rates.
- This option offloads callback invocation from the set of CPUs
- specified at boot time by the rcu_nocbs parameter. For each
- such CPU, a kthread ("rcuox/N") will be created to invoke
- callbacks, where the "N" is the CPU being offloaded, and where
- the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for
- RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread
- from running on the specified CPUs, but (1) the kthreads may be
- preempted between each callback, and (2) affinity or cgroups can
- be used to force the kthreads to run on whatever set of CPUs is
- desired.
+ This option offloads callback invocation from the set of
+ CPUs specified at boot time by the rcu_nocbs parameter.
+ For each such CPU, a kthread ("rcuox/N") will be created to
+ invoke callbacks, where the "N" is the CPU being offloaded,
+ and where the "x" is "p" for RCU-preempt (PREEMPTION kernels)
+ and "s" for RCU-sched (!PREEMPTION kernels). This option
+ also creates another kthread for each sqrt(nr_cpu_ids) CPUs
+ ("rcuog/N", where N is the first CPU in that group to come
+ online), which handles grace periods for its group. Nothing
+ prevents these kthreads from running on the specified CPUs,
+ but (1) the kthreads may be preempted between each callback,
+ and (2) affinity or cgroups can be used to force the kthreads
+ to run on whatever set of CPUs is desired.
+
+ The sqrt(nr_cpu_ids) grouping may be overridden using the
+ rcutree.rcu_nocb_gp_stride kernel boot parameter. This can
+ be especially helpful for smaller numbers of CPUs, where
+ sqrt(nr_cpu_ids) can be a bit of a blunt instrument.
Say Y here if you need reduced OS jitter, despite added overhead.
Say N here if you are unsure.
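
To make the sqrt(nr_cpu_ids) grouping concrete, here is a minimal userspace C sketch of the arithmetic described in the help text above. The CPU count, the integer square root helper, and the simplification that a group's rcuog leader is simply its lowest-numbered CPU (rather than the first CPU in the group to come online) are all assumptions for illustration; the kernel's real assignment lives in kernel/rcu/tree_nocb.h.

/*
 * Illustrative only: split nr_cpu_ids CPUs into rcuog groups of
 * "stride" CPUs, where stride defaults to sqrt(nr_cpu_ids) and can
 * be overridden (in the real kernel) by rcutree.rcu_nocb_gp_stride.
 */
#include <stdio.h>

static int int_sqrt_approx(int x)
{
	int r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	int nr_cpu_ids = 64;	/* example CPU count */
	int stride = 0;		/* 0: use the sqrt(nr_cpu_ids) default */

	if (stride <= 0)
		stride = int_sqrt_approx(nr_cpu_ids);
	for (int cpu = 0; cpu < nr_cpu_ids; cpu++)
		printf("CPU %2d -> rcuog group leader CPU %2d\n",
		       cpu, (cpu / stride) * stride);
	return 0;
}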
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 259904075636..fadc08ad4b7b 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -120,7 +120,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 6d37596deb1f..0f3059b1b80d 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -889,14 +889,14 @@ kfree_scale_init(void)
if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
- WARN_ON_ONCE(1);
- return -1;
+ firsterr = -1;
+ goto unwind;
}
if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) {
pr_alert("ERROR: call_rcu() CBs are being too lazy!\n");
- WARN_ON_ONCE(1);
- return -1;
+ firsterr = -1;
+ goto unwind;
}
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bb75dbf5c800..612d27690335 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -57,9 +57,9 @@ MODULE_AUTHOR("Paul E. McKenney <[email protected]> and Josh Triplett <josh@
/* Bits for ->extendables field, extendables param, and related definitions. */
#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1)
-#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2)
+#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
@@ -71,6 +71,9 @@ MODULE_AUTHOR("Paul E. McKenney <[email protected]> and Josh Triplett <josh@
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+#define RCUTORTURE_RDR_ALLBITS \
+ (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \
+ RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2)
#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */
/* Must be power of two minus one. */
#define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
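
For reference, a small userspace sketch of the readstate layout these widened masks define: the extendables flags stay in the low byte, the first reader's (possibly multi-bit) index occupies bits 8-15, and the second reader's index occupies bits 16-23. The flag and index values below are arbitrary examples; the packing and unpacking mirror rcutorture_one_extend() later in this patch.

/* Values copied from the hunk above; everything else is illustration. */
#include <assert.h>

#define RCUTORTURE_RDR_SHIFT_1	8
#define RCUTORTURE_RDR_MASK_1	(0xff << RCUTORTURE_RDR_SHIFT_1)
#define RCUTORTURE_RDR_SHIFT_2	16
#define RCUTORTURE_RDR_MASK_2	(0xff << RCUTORTURE_RDR_SHIFT_2)

int main(void)
{
	int flags = 0x15;	/* some RCUTORTURE_RDR_* flag bits, all below bit 8 */
	int idx1 = 0x5;		/* composite SRCU index, e.g. plain + lite */
	int idx2 = 0x1;

	int readstate = flags |
			((idx1 << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1) |
			((idx2 << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2);

	/* The unlock paths recover each index from its own byte. */
	assert(((readstate & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1) == idx1);
	assert(((readstate & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2) == idx2);
	return 0;
}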
@@ -108,6 +111,7 @@ torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disab
torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
+torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit.");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
@@ -393,6 +397,7 @@ struct rcu_torture_ops {
int slow_gps;
int no_pi_lock;
int debug_objects;
+ int start_poll_irqsoff;
const char *name;
};
@@ -581,6 +586,7 @@ static struct rcu_torture_ops rcu_ops = {
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
.debug_objects = 1,
+ .start_poll_irqsoff = 1,
.name = "rcu"
};
@@ -641,10 +647,25 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
static int srcu_torture_read_lock(void)
{
- if (cur_ops == &srcud_ops)
- return srcu_read_lock_nmisafe(srcu_ctlp);
- else
- return srcu_read_lock(srcu_ctlp);
+ int idx;
+ int ret = 0;
+
+ if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) {
+ idx = srcu_read_lock(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx;
+ }
+ if (reader_flavor & 0x2) {
+ idx = srcu_read_lock_nmisafe(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 1;
+ }
+ if (reader_flavor & 0x4) {
+ idx = srcu_read_lock_lite(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 2;
+ }
+ return ret;
}
static void
@@ -668,10 +689,13 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx)
{
- if (cur_ops == &srcud_ops)
- srcu_read_unlock_nmisafe(srcu_ctlp, idx);
- else
- srcu_read_unlock(srcu_ctlp, idx);
+ WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & 0x4)
+ srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
+ if (reader_flavor & 0x2)
+ srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
+ if ((reader_flavor & 0x1) || !(reader_flavor & 0x7))
+ srcu_read_unlock(srcu_ctlp, idx & 0x1);
}
static int torture_srcu_read_lock_held(void)
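
The composite index returned by the new srcu_torture_read_lock() can be illustrated with a self-contained userspace sketch: each enabled reader flavor contributes its own SRCU index (0 or 1) in a distinct bit, and the unlock path peels those bits back out. The stub lock functions below merely stand in for srcu_read_lock(), srcu_read_lock_nmisafe(), and srcu_read_lock_lite(); the returned indices are made up.

#include <assert.h>
#include <stdio.h>

static int stub_lock_plain(void)   { return 1; }	/* pretend SRCU idx 1 */
static int stub_lock_nmisafe(void) { return 0; }
static int stub_lock_lite(void)    { return 1; }

static int composite_lock(int reader_flavor)
{
	int ret = 0;

	if ((reader_flavor & 0x1) || !(reader_flavor & 0x7))
		ret += stub_lock_plain();		/* bit 0 */
	if (reader_flavor & 0x2)
		ret += stub_lock_nmisafe() << 1;	/* bit 1 */
	if (reader_flavor & 0x4)
		ret += stub_lock_lite() << 2;		/* bit 2 */
	return ret;
}

int main(void)
{
	int idx = composite_lock(0x5);	/* plain + lite readers enabled */

	/* Mirrors srcu_torture_read_unlock()'s per-flavor extraction. */
	printf("lite idx=%d, nmisafe idx=%d, plain idx=%d\n",
	       (idx & 0x4) >> 2, (idx & 0x2) >> 1, idx & 0x1);
	assert(idx == 0x5);
	return 0;
}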
@@ -1059,8 +1083,13 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *star
// At most one persisted message per boost test.
j = jiffies;
lp = READ_ONCE(last_persist);
- if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
- pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ if (time_after(j, lp + mininterval) &&
+ cmpxchg(&last_persist, lp, j) == lp) {
+ if (cpu < 0)
+ pr_info("Boost inversion persisted: QS from all CPUs\n");
+ else
+ pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ }
return false; // passed on a technicality
}
VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
@@ -1695,14 +1724,22 @@ rcu_torture_fakewriter(void *arg)
cur_ops->cond_sync_exp_full(&gp_snap_full);
break;
case RTWS_POLL_GET:
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_disable();
gp_snap = cur_ops->start_gp_poll();
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_enable();
while (!cur_ops->poll_gp_state(gp_snap)) {
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
}
break;
case RTWS_POLL_GET_FULL:
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_disable();
cur_ops->start_gp_poll_full(&gp_snap_full);
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_enable();
while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
@@ -1820,7 +1857,7 @@ static void rcutorture_one_extend(int *readstate, int newstate,
int statesold = *readstate & ~newstate;
WARN_ON_ONCE(idxold2 < 0);
- WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
+ WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
@@ -1835,9 +1872,9 @@ static void rcutorture_one_extend(int *readstate, int newstate,
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
if (statesnew & RCUTORTURE_RDR_RCU_1)
- idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1;
+ idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1;
if (statesnew & RCUTORTURE_RDR_RCU_2)
- idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
+ idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2;
/*
* Next, remove old protection, in decreasing order of strength
@@ -1857,7 +1894,7 @@ static void rcutorture_one_extend(int *readstate, int newstate,
if (statesold & RCUTORTURE_RDR_RBH)
rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_RCU_2) {
- cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1);
+ cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2);
WARN_ON_ONCE(idxnew2 != -1);
idxold2 = 0;
}
@@ -1867,7 +1904,7 @@ static void rcutorture_one_extend(int *readstate, int newstate,
lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
if (lockit)
raw_spin_lock_irqsave(&current->pi_lock, flags);
- cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1);
+ cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
WARN_ON_ONCE(idxnew1 != -1);
idxold1 = 0;
if (lockit)
@@ -1882,16 +1919,13 @@ static void rcutorture_one_extend(int *readstate, int newstate,
if (idxnew1 == -1)
idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
WARN_ON_ONCE(idxnew1 < 0);
- if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1))
- pr_info("Unexpected idxnew1 value of %#x\n", idxnew1);
if (idxnew2 == -1)
idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
WARN_ON_ONCE(idxnew2 < 0);
- WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
*readstate = idxnew1 | idxnew2 | newstate;
WARN_ON_ONCE(*readstate < 0);
- if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1))
- pr_info("Unexpected idxnew2 value of %#x\n", idxnew2);
+ if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS))
+ pr_info("Unexpected readstate value of %#x\n", *readstate);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1916,7 +1950,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits.
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
@@ -2389,6 +2423,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
"read_exit_delay=%d read_exit_burst=%d "
+ "reader_flavor=%x "
"nocbs_nthreads=%d nocbs_toggle=%d "
"test_nmis=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
@@ -2401,6 +2436,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
n_barrier_cbs,
onoff_interval, onoff_holdoff,
read_exit_delay, read_exit_burst,
+ reader_flavor,
nocbs_nthreads, nocbs_toggle,
test_nmis);
}
@@ -2440,6 +2476,14 @@ static int rcutorture_booster_init(unsigned int cpu)
WARN_ON_ONCE(!t);
sp.sched_priority = 2;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#ifdef CONFIG_IRQ_FORCED_THREADING
+ if (force_irqthreads()) {
+ t = per_cpu(ktimerd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+#endif
}
/* Don't allow time recalculation while creating a new task. */
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 0db9db73f57f..aacfcc9838b3 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -75,6 +75,9 @@ MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
+// Number of seconds to extend warm-up and cool-down for multiple guest OSes
+torture_param(long, guest_os_delay, 0,
+ "Number of seconds to extend warm-up/cool-down for multiple guest OSes.");
// Wait until there are multiple CPUs before starting test.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
"Holdoff time before test start (s)");
@@ -212,6 +215,36 @@ static const struct ref_scale_ops srcu_ops = {
.name = "srcu"
};
+static void srcu_lite_ref_scale_read_section(const int nloops)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock_lite(srcu_ctlp);
+ srcu_read_unlock_lite(srcu_ctlp, idx);
+ }
+}
+
+static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock_lite(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_lite(srcu_ctlp, idx);
+ }
+}
+
+static const struct ref_scale_ops srcu_lite_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_lite_ref_scale_read_section,
+ .delaysection = srcu_lite_ref_scale_delay_section,
+ .name = "srcu-lite"
+};
+
#ifdef CONFIG_TASKS_RCU
// Definitions for RCU Tasks ref scale testing: Empty read markers.
@@ -801,6 +834,18 @@ static void rcu_scale_one_reader(void)
cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
}
+// Warm up cache, or, if needed run a series of rcu_scale_one_reader()
+// to allow multiple rcuscale guest OSes to collect mutually valid data.
+static void rcu_scale_warm_cool(void)
+{
+ unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1);
+
+ do {
+ rcu_scale_one_reader();
+ cond_resched();
+ } while (time_before(jiffies, jdone));
+}
+
// Reader kthread. Repeatedly does empty RCU read-side
// critical section, minimizing update-side interference.
static int
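
A minimal userspace sketch of the rcu_scale_warm_cool() timing above, assuming a fake jiffies counter and HZ value: with guest_os_delay <= 0 the deadline lands one tick in the past, so the warm-up body runs exactly once, while a positive guest_os_delay keeps it running for roughly that many seconds.

#include <stdio.h>

#define HZ 100

static unsigned long jiffies;

static int time_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

static int warm_cool_iterations(long guest_os_delay)
{
	unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1);
	int n = 0;

	do {
		n++;		/* one rcu_scale_one_reader() pass */
		jiffies++;	/* pretend each pass takes one tick */
	} while (time_before(jiffies, jdone));
	return n;
}

int main(void)
{
	printf("guest_os_delay=0 -> %d pass(es)\n", warm_cool_iterations(0));
	printf("guest_os_delay=2 -> %d pass(es)\n", warm_cool_iterations(2));
	return 0;
}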
@@ -829,7 +874,7 @@ repeat:
goto end;
// Make sure that the CPU is affinitized appropriately during testing.
- WARN_ON_ONCE(raw_smp_processor_id() != me);
+ WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids);
WRITE_ONCE(rt->start_reader, 0);
if (!atomic_dec_return(&n_started))
@@ -957,6 +1002,7 @@ static int main_func(void *arg)
schedule_timeout_uninterruptible(1);
// Start exp readers up per experiment
+ rcu_scale_warm_cool();
for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
if (torture_must_stop())
goto end;
@@ -987,6 +1033,7 @@ static int main_func(void *arg)
result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
}
+ rcu_scale_warm_cool();
// Print the average of all experiments
SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
@@ -1082,9 +1129,10 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static const struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
- &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
- &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
+ &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
+ &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
+ &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops,
+ &typesafe_seqlock_ops,
};
if (!torture_init_begin(scale_type, verbose))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 549c03336ee9..4dcbf8aa80ff 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -122,8 +122,8 @@ void srcu_drive_gp(struct work_struct *wp)
ssp = container_of(wp, struct srcu_struct, srcu_work);
preempt_disable(); // Needed for PREEMPT_AUTO
if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
- return; /* Already running or nothing to do. */
preempt_enable();
+ return; /* Already running or nothing to do. */
}
/* Remove recently arrived callbacks and wait for readers. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 31706e3293bc..5e2e53464794 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -128,7 +128,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+ BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) !=
ARRAY_SIZE(sdp->srcu_unlock_count));
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -187,7 +187,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Each pass through this loop initializes one srcu_node structure. */
srcu_for_each_node_breadth_first(ssp, snp) {
spin_lock_init(&ACCESS_PRIVATE(snp, lock));
- WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
+ BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
@@ -419,41 +419,60 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
}
/*
- * Returns approximate total of the readers' ->srcu_lock_count[] values
- * for the rank of per-CPU counters specified by idx.
+ * Is the current or any upcoming grace period to be expedited?
*/
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
+static bool srcu_gp_is_expedited(struct srcu_struct *ssp)
+{
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp));
+}
+
+/*
+ * Computes approximate total of the readers' ->srcu_lock_count[] values
+ * for the rank of per-CPU counters specified by idx, and returns true if
+ * the caller did the proper barrier (gp), and if the count of the locks
+ * matches that of the unlocks passed in.
+ */
+static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks)
{
int cpu;
+ unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
+ sum += atomic_long_read(&sdp->srcu_lock_count[idx]);
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
- return sum;
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
+ "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
+ if (mask & SRCU_READ_FLAVOR_LITE && !gp)
+ return false;
+ return sum == unlocks;
}
/*
* Returns approximate total of the readers' ->srcu_unlock_count[] values
* for the rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm)
{
int cpu;
unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
- if (IS_ENABLED(CONFIG_PROVE_RCU))
- mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
+ sum += atomic_long_read(&sdp->srcu_unlock_count[idx]);
+ mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
- WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
- "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
+ "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
+ *rdm = mask;
return sum;
}
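
The mixed-flavor detection in srcu_readers_lock_idx() and srcu_readers_unlock_idx() relies on the usual power-of-two trick: mask & (mask - 1) is nonzero exactly when more than one flavor bit has been observed. A short standalone sketch, with flavor values assumed to mirror the SRCU_READ_FLAVOR_* bits (only the one-bit-per-flavor property matters):

#include <assert.h>

#define FLAVOR_NORMAL	0x1
#define FLAVOR_NMI	0x2
#define FLAVOR_LITE	0x4

static int mixed_flavors(unsigned long mask)
{
	return (mask & (mask - 1)) != 0;
}

int main(void)
{
	assert(!mixed_flavors(0));				/* no readers seen yet */
	assert(!mixed_flavors(FLAVOR_LITE));			/* single flavor: OK */
	assert(mixed_flavors(FLAVOR_NORMAL | FLAVOR_NMI));	/* would trigger the WARN */
	return 0;
}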
@@ -463,22 +482,28 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
*/
static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
{
+ bool did_gp;
+ unsigned long rdm;
unsigned long unlocks;
- unlocks = srcu_readers_unlock_idx(ssp, idx);
+ unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm);
+ did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE);
/*
* Make sure that a lock is always counted if the corresponding
* unlock is counted. Needs to be a smp_mb() as the read side may
* contain a read from a variable that is written to before the
* synchronize_srcu() in the write side. In this case smp_mb()s
- * A and B act like the store buffering pattern.
+ * A and B (or X and Y) act like the store buffering pattern.
*
- * This smp_mb() also pairs with smp_mb() C to prevent accesses
- * after the synchronize_srcu() from being executed before the
- * grace period ends.
+ * This smp_mb() also pairs with smp_mb() C (or, in the case of X,
+ * Z) to prevent accesses after the synchronize_srcu() from being
+ * executed before the grace period ends.
*/
- smp_mb(); /* A */
+ if (!did_gp)
+ smp_mb(); /* A */
+ else
+ synchronize_rcu(); /* X */
/*
* If the locks are the same as the unlocks, then there must have
@@ -536,7 +561,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
* which are unlikely to be configured with an address space fully
* populated with memory, at least not anytime soon.
*/
- return srcu_readers_lock_idx(ssp, idx) == unlocks;
+ return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks);
}
/**
@@ -554,12 +579,12 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
- sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
- sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
- sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
+ sum += atomic_long_read(&sdp->srcu_lock_count[0]);
+ sum += atomic_long_read(&sdp->srcu_lock_count[1]);
+ sum -= atomic_long_read(&sdp->srcu_unlock_count[0]);
+ sum -= atomic_long_read(&sdp->srcu_unlock_count[1]);
}
return sum;
}
@@ -622,7 +647,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
unsigned long jbase = SRCU_INTERVAL;
struct srcu_usage *sup = ssp->srcu_sup;
- if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ if (srcu_gp_is_expedited(ssp))
jbase = 0;
if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
j = jiffies - 1;
@@ -687,28 +712,28 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-#ifdef CONFIG_PROVE_RCU
/*
- * Check for consistent NMI safety.
+ * Check for consistent reader flavor.
*/
-void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
+void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
{
- int nmi_safe_mask = 1 << nmi_safe;
- int old_nmi_safe_mask;
+ int old_read_flavor;
struct srcu_data *sdp;
- /* NMI-unsafe use in NMI is a bad sign */
- WARN_ON_ONCE(!nmi_safe && in_nmi());
+ /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */
+ WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi());
+ WARN_ON_ONCE(read_flavor & (read_flavor - 1));
+
sdp = raw_cpu_ptr(ssp->sda);
- old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
- if (!old_nmi_safe_mask) {
- WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
- return;
+ old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
+ if (!old_read_flavor) {
+ old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
+ if (!old_read_flavor)
+ return;
}
- WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
+ WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor);
}
-EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
-#endif /* CONFIG_PROVE_RCU */
+EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);
/*
* Counts the new reader in the appropriate per-CPU element of the
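
A userspace sketch of the record-once pattern in __srcu_check_read_flavor() above: the first reader installs its flavor with a compare-and-swap, and later readers are only compared against the recorded value. The GCC/Clang builtin below stands in for the kernel's cmpxchg(), and the flavor constants are placeholders.

#include <stdio.h>

static int recorded_flavor;	/* 0 means "not recorded yet" */

static void check_read_flavor(int read_flavor)
{
	int old = recorded_flavor;

	if (!old) {
		old = __sync_val_compare_and_swap(&recorded_flavor, 0, read_flavor);
		if (!old)
			return;	/* this caller won the race and recorded the flavor */
	}
	if (old != read_flavor)
		printf("flavor mismatch: recorded %#x, now %#x\n", old, read_flavor);
}

int main(void)
{
	check_read_flavor(0x2);	/* first use records, say, NMI-safe readers */
	check_read_flavor(0x2);	/* same flavor: silent */
	check_read_flavor(0x4);	/* mixing flavors would trigger the WARN */
	return 0;
}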
@@ -867,7 +892,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
spin_lock_irq_rcu_node(sup);
idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ if (srcu_gp_is_expedited(ssp))
cbdelay = 0;
WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
@@ -1122,6 +1147,8 @@ static void srcu_flip(struct srcu_struct *ssp)
* it stays until either (1) Compilers learn about this sort of
* control dependency or (2) Some production workload running on
* a production system is unduly delayed by this slowpath smp_mb().
+ * Except for _lite() readers, where it is inoperative, which
+ * means that it is a good thing that it is redundant.
*/
smp_mb(); /* E */ /* Pairs with B and C. */
@@ -1139,7 +1166,9 @@ static void srcu_flip(struct srcu_struct *ssp)
}
/*
- * If SRCU is likely idle, return true, otherwise return false.
+ * If SRCU is likely idle, in other words, the next SRCU grace period
+ * should be expedited, return true, otherwise return false. Except that
+ * in the presence of _lite() readers, always return false.
*
* Note that it is OK for several current from-idle requests for a new
* grace period from idle to specify expediting because they will all end
@@ -1159,7 +1188,7 @@ static void srcu_flip(struct srcu_struct *ssp)
* negligible when amortized over that time period, and the extra latency
* of a needlessly non-expedited grace period is similarly negligible.
*/
-static bool srcu_might_be_idle(struct srcu_struct *ssp)
+static bool srcu_should_expedite(struct srcu_struct *ssp)
{
unsigned long curseq;
unsigned long flags;
@@ -1168,6 +1197,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long tlast;
check_init_srcu_struct(ssp);
+ /* If _lite() readers, don't do unsolicited expediting. */
+ if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)
+ return false;
/* If the local srcu_data structure has callbacks, not idle. */
sdp = raw_cpu_ptr(ssp->sda);
spin_lock_irqsave_rcu_node(sdp, flags);
@@ -1469,14 +1501,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
* Implementation of these memory-ordering guarantees is similar to
* that of synchronize_rcu().
*
- * If SRCU is likely idle, expedite the first request. This semantic
- * was provided by Classic SRCU, and is relied upon by its users, so TREE
- * SRCU must also provide it. Note that detecting idleness is heuristic
- * and subject to both false positives and negatives.
+ * If SRCU is likely idle as determined by srcu_should_expedite(),
+ * expedite the first request. This semantic was provided by Classic SRCU,
+ * and is relied upon by its users, so TREE SRCU must also provide it.
+ * Note that detecting idleness is heuristic and subject to both false
+ * positives and negatives.
*/
void synchronize_srcu(struct srcu_struct *ssp)
{
- if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited())
+ if (srcu_should_expedite(ssp) || rcu_gp_is_expedited())
synchronize_srcu_expedited(ssp);
else
__synchronize_srcu(ssp, true);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 4d7ee95df06e..59314da5eb60 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1407,7 +1407,8 @@ static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
*/
void synchronize_rcu_tasks_rude(void)
{
- synchronize_rcu_tasks_generic(&rcu_tasks_rude);
+ if (!IS_ENABLED(CONFIG_ARCH_WANTS_NO_INSTR) || IS_ENABLED(CONFIG_FORCE_TASKS_RUDE_RCU))
+ synchronize_rcu_tasks_generic(&rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
@@ -1549,22 +1550,7 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v)
*/
u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
{
- union rcu_special ret;
- union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
- union rcu_special trs_new = trs_old;
-
- if (trs_old.b.need_qs != old)
- return trs_old.b.need_qs;
- trs_new.b.need_qs = new;
-
- // Although cmpxchg() appears to KCSAN to update all four bytes,
- // only the .b.need_qs byte actually changes.
- instrument_atomic_read_write(&t->trc_reader_special.b.need_qs,
- sizeof(t->trc_reader_special.b.need_qs));
- // Avoid false-positive KCSAN failures.
- ret.s = data_race(cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s));
-
- return ret.b.need_qs;
+ return cmpxchg(&t->trc_reader_special.b.need_qs, old, new);
}
EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b1f883fcd918..ff98233d4aa5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3511,7 +3511,7 @@ static int krc_count(struct kfree_rcu_cpu *krcp)
}
static void
-schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
long delay, delay_left;
@@ -3526,6 +3526,16 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
}
static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ __schedule_delayed_monitor_work(krcp);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static void
kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
{
struct list_head bulk_ready[FREE_N_CHANNELS];
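
The schedule_delayed_monitor_work() split follows the usual locked/unlocked wrapper convention: the double-underscore variant assumes krcp->lock is already held, while the plain-named wrapper acquires and releases it around the call. A generic sketch of that convention, using a pthread mutex in place of the raw spinlock (names other than the pattern itself are assumptions):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller must hold "lock", like __schedule_delayed_monitor_work(). */
static void __do_work_locked(void)
{
	printf("monitor work scheduled\n");
}

/* Unlocked wrapper, like schedule_delayed_monitor_work(). */
static void do_work(void)
{
	pthread_mutex_lock(&lock);
	__do_work_locked();
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	do_work();			/* lock not held: use the wrapper */

	pthread_mutex_lock(&lock);	/* lock already held: call the __ variant */
	__do_work_locked();
	pthread_mutex_unlock(&lock);
	return 0;
}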
@@ -3836,7 +3846,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
- schedule_delayed_monitor_work(krcp);
+ __schedule_delayed_monitor_work(krcp);
unlock_return:
krc_this_cpu_unlock(krcp, flags);
@@ -4194,7 +4204,6 @@ static void start_poll_synchronize_rcu_common(void)
struct rcu_data *rdp;
struct rcu_node *rnp;
- lockdep_assert_irqs_enabled();
local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
@@ -4219,9 +4228,6 @@ static void start_poll_synchronize_rcu_common(void)
* grace period has elapsed in the meantime. If the needed grace period
* is not already slated to start, notifies RCU core of the need for that
* grace period.
- *
- * Interrupts must be enabled for the case where it is necessary to awaken
- * the grace-period kthread.
*/
unsigned long start_poll_synchronize_rcu(void)
{
@@ -4242,9 +4248,6 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
* grace period (whether normal or expedited) has elapsed in the meantime.
* If the needed grace period is not already slated to start, notifies
* RCU core of the need for that grace period.
- *
- * Interrupts must be enabled for the case where it is necessary to awaken
- * the grace-period kthread.
*/
void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
@@ -5580,8 +5583,7 @@ void rcu_init_geometry(void)
* Complain and fall back to the compile-time values if this
* limit is exceeded.
*/
- if (rcu_fanout_leaf < 2 ||
- rcu_fanout_leaf > sizeof(unsigned long) * 8) {
+ if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) {
rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1);
return;
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 16865475120b..2605dd234a13 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -891,7 +891,18 @@ static void nocb_cb_wait(struct rcu_data *rdp)
swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
nocb_cb_wait_cond(rdp));
if (kthread_should_park()) {
- kthread_parkme();
+ /*
+ * kthread_park() must be preceded by an rcu_barrier().
+ * But yet another rcu_barrier() might have sneaked in between
+ * the barrier callback execution and the callbacks counter
+ * decrement.
+ */
+ if (rdp->nocb_cb_sleep) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ kthread_parkme();
+ }
} else if (READ_ONCE(rdp->nocb_cb_sleep)) {
WARN_ON(signal_pending(current));
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1c7cbd145d5e..3927ea5f7955 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -183,9 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
switch (blkd_state) {
case 0:
case RCU_EXP_TASKS:
- case RCU_EXP_TASKS + RCU_GP_BLKD:
+ case RCU_EXP_TASKS | RCU_GP_BLKD:
case RCU_GP_TASKS:
- case RCU_GP_TASKS + RCU_EXP_TASKS:
+ case RCU_GP_TASKS | RCU_EXP_TASKS:
/*
* Blocking neither GP, or first task blocking the normal
@@ -198,10 +198,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
case RCU_EXP_BLKD:
case RCU_GP_BLKD:
- case RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
+ case RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
/*
* First task arriving that blocks either GP, or first task
@@ -214,9 +214,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
break;
- case RCU_EXP_TASKS + RCU_EXP_BLKD:
- case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
+ case RCU_EXP_TASKS | RCU_EXP_BLKD:
+ case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD:
/*
* Second or subsequent task blocking the expedited GP.
@@ -227,8 +227,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
list_add(&t->rcu_node_entry, rnp->exp_tasks);
break;
- case RCU_GP_TASKS + RCU_GP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
+ case RCU_GP_TASKS | RCU_GP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD:
/*
* Second or subsequent task blocking the normal GP.
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 4432db6d0b99..925fcdad5dea 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -76,36 +76,6 @@ int rcu_jiffies_till_stall_check(void)
}
EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
-/**
- * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled?
- *
- * Returns @true if the current grace period is sufficiently old that
- * it is reasonable to assume that it might be stalled. This can be
- * useful when deciding whether to allocate memory to enable RCU-mediated
- * freeing on the one hand or just invoking synchronize_rcu() on the other.
- * The latter is preferable when the grace period is stalled.
- *
- * Note that sampling of the .gp_start and .gp_seq fields must be done
- * carefully to avoid false positives at the beginnings and ends of
- * grace periods.
- */
-bool rcu_gp_might_be_stalled(void)
-{
- unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV;
- unsigned long j = jiffies;
-
- if (d < RCU_STALL_MIGHT_MIN)
- d = RCU_STALL_MIGHT_MIN;
- smp_mb(); // jiffies before .gp_seq to avoid false positives.
- if (!rcu_gp_in_progress())
- return false;
- // Long delays at this point avoids false positive, but a delay
- // of ULONG_MAX/4 jiffies voids your no-false-positive warranty.
- smp_mb(); // .gp_seq before second .gp_start
- // And ditto here.
- return !time_before(j, READ_ONCE(rcu_state.gp_start) + d);
-}
-
/* Don't do RCU CPU stall warnings during long sysrq printouts. */
void rcu_sysrq_start(void)
{
@@ -365,7 +335,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
* that don't support NMI-based stack dumps. The NMI-triggered stack
* traces are more accurate because they are printed by the target CPU.
*/
-static void rcu_dump_cpu_stacks(void)
+static void rcu_dump_cpu_stacks(unsigned long gp_seq)
{
int cpu;
unsigned long flags;
@@ -373,15 +343,23 @@ static void rcu_dump_cpu_stacks(void)
rcu_for_each_leaf_node(rnp) {
printk_deferred_enter();
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- for_each_leaf_node_possible_cpu(rnp, cpu)
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (gp_seq != data_race(rcu_state.gp_seq)) {
+ printk_deferred_exit();
+ pr_err("INFO: Stall ended during stack backtracing.\n");
+ return;
+ }
+ if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu)))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
if (cpu_is_offline(cpu))
pr_err("Offline CPU %d blocking current GP.\n", cpu);
else
dump_cpu_task(cpu);
}
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
printk_deferred_exit();
}
}
@@ -638,7 +616,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
data_race(rcu_state.n_online_cpus)); // Diagnostic read
if (ndetected) {
- rcu_dump_cpu_stacks();
+ rcu_dump_cpu_stacks(gp_seq);
/* Complain about tasks blocking the grace period. */
rcu_for_each_leaf_node(rnp)
@@ -670,7 +648,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
rcu_force_quiescent_state(); /* Kick them all. */
}
-static void print_cpu_stall(unsigned long gps)
+static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
int cpu;
unsigned long flags;
@@ -707,7 +685,7 @@ static void print_cpu_stall(unsigned long gps)
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
- rcu_dump_cpu_stacks();
+ rcu_dump_cpu_stacks(gp_seq);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Rewrite if needed in case of slow consoles. */
@@ -789,7 +767,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
gs2 = READ_ONCE(rcu_state.gp_seq);
if (gs1 != gs2 ||
ULONG_CMP_LT(j, js) ||
- ULONG_CMP_GE(gps, js))
+ ULONG_CMP_GE(gps, js) ||
+ !rcu_seq_state(gs2))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
jn = jiffies + ULONG_MAX / 2;
@@ -810,7 +789,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
} else if (self_detected) {
/* We haven't checked in, so go dump stack. */
- print_cpu_stall(gps);
+ print_cpu_stall(gs2, gps);
} else {
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2, gps);