aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt140
-rw-r--r--Documentation/core-api/kernel-api.rst12
-rw-r--r--MAINTAINERS2
-rw-r--r--include/linux/notifier.h10
-rw-r--r--include/linux/rcupdate.h29
-rw-r--r--kernel/locking/locktorture.c51
-rw-r--r--kernel/rcu/rcuscale.c199
-rw-r--r--kernel/rcu/tasks.h5
-rw-r--r--kernel/rcu/tree.c58
-rw-r--r--kernel/rcu/tree_nocb.h52
-rw-r--r--tools/testing/selftests/rcutorture/bin/functions.sh2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot2
13 files changed, 322 insertions, 242 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9e5bab29685f..5b544b3f00fc 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4736,43 +4736,6 @@
the propagation of recent CPU-hotplug changes up
the rcu_node combining tree.
- rcutree.use_softirq= [KNL]
- If set to zero, move all RCU_SOFTIRQ processing to
- per-CPU rcuc kthreads. Defaults to a non-zero
- value, meaning that RCU_SOFTIRQ is used by default.
- Specify rcutree.use_softirq=0 to use rcuc kthreads.
-
- But note that CONFIG_PREEMPT_RT=y kernels disable
- this kernel boot parameter, forcibly setting it
- to zero.
-
- rcutree.rcu_fanout_exact= [KNL]
- Disable autobalancing of the rcu_node combining
- tree. This is used by rcutorture, and might
- possibly be useful for architectures having high
- cache-to-cache transfer latencies.
-
- rcutree.rcu_fanout_leaf= [KNL]
- Change the number of CPUs assigned to each
- leaf rcu_node structure. Useful for very
- large systems, which will choose the value 64,
- and for NUMA systems with large remote-access
- latencies, which will choose a value aligned
- with the appropriate hardware boundaries.
-
- rcutree.rcu_min_cached_objs= [KNL]
- Minimum number of objects which are cached and
- maintained per one CPU. Object size is equal
- to PAGE_SIZE. The cache allows to reduce the
- pressure to page allocator, also it makes the
- whole algorithm to behave better in low memory
- condition.
-
- rcutree.rcu_delay_page_cache_fill_msec= [KNL]
- Set the page-cache refill delay (in milliseconds)
- in response to low-memory conditions. The range
- of permitted values is in the range 0:100000.
-
rcutree.jiffies_till_first_fqs= [KNL]
Set delay from grace-period initialization to
first attempt to force quiescent states.
@@ -4811,21 +4774,6 @@
When RCU_NOCB_CPU is set, also adjust the
priority of NOCB callback kthreads.
- rcutree.rcu_divisor= [KNL]
- Set the shift-right count to use to compute
- the callback-invocation batch limit bl from
- the number of callbacks queued on this CPU.
- The result will be bounded below by the value of
- the rcutree.blimit kernel parameter. Every bl
- callbacks, the softirq handler will exit in
- order to allow the CPU to do other work.
-
- Please note that this callback-invocation batch
- limit applies only to non-offloaded callback
- invocation. Offloaded callbacks are instead
- invoked in the context of an rcuoc kthread, which
- scheduler will preempt as it does any other task.
-
rcutree.nocb_nobypass_lim_per_jiffy= [KNL]
On callback-offloaded (rcu_nocbs) CPUs,
RCU reduces the lock contention that would
@@ -4839,14 +4787,6 @@
the ->nocb_bypass queue. The definition of "too
many" is supplied by this kernel boot parameter.
- rcutree.rcu_nocb_gp_stride= [KNL]
- Set the number of NOCB callback kthreads in
- each group, which defaults to the square root
- of the number of CPUs. Larger numbers reduce
- the wakeup overhead on the global grace-period
- kthread, but increases that same overhead on
- each group's NOCB grace-period kthread.
-
rcutree.qhimark= [KNL]
Set threshold of queued RCU callbacks beyond which
batch limiting is disabled.
@@ -4864,6 +4804,56 @@
on rcutree.qhimark at boot time and to zero to
disable more aggressive help enlistment.
+ rcutree.rcu_delay_page_cache_fill_msec= [KNL]
+ Set the page-cache refill delay (in milliseconds)
+ in response to low-memory conditions. The range
+ of permitted values is in the range 0:100000.
+
+ rcutree.rcu_divisor= [KNL]
+ Set the shift-right count to use to compute
+ the callback-invocation batch limit bl from
+ the number of callbacks queued on this CPU.
+ The result will be bounded below by the value of
+ the rcutree.blimit kernel parameter. Every bl
+ callbacks, the softirq handler will exit in
+ order to allow the CPU to do other work.
+
+ Please note that this callback-invocation batch
+ limit applies only to non-offloaded callback
+ invocation. Offloaded callbacks are instead
+ invoked in the context of an rcuoc kthread, which
+ scheduler will preempt as it does any other task.
+
+ rcutree.rcu_fanout_exact= [KNL]
+ Disable autobalancing of the rcu_node combining
+ tree. This is used by rcutorture, and might
+ possibly be useful for architectures having high
+ cache-to-cache transfer latencies.
+
+ rcutree.rcu_fanout_leaf= [KNL]
+ Change the number of CPUs assigned to each
+ leaf rcu_node structure. Useful for very
+ large systems, which will choose the value 64,
+ and for NUMA systems with large remote-access
+ latencies, which will choose a value aligned
+ with the appropriate hardware boundaries.
+
+ rcutree.rcu_min_cached_objs= [KNL]
+ Minimum number of objects which are cached and
+ maintained per one CPU. Object size is equal
+ to PAGE_SIZE. The cache allows to reduce the
+ pressure to page allocator, also it makes the
+ whole algorithm to behave better in low memory
+ condition.
+
+ rcutree.rcu_nocb_gp_stride= [KNL]
+ Set the number of NOCB callback kthreads in
+ each group, which defaults to the square root
+ of the number of CPUs. Larger numbers reduce
+ the wakeup overhead on the global grace-period
+ kthread, but increases that same overhead on
+ each group's NOCB grace-period kthread.
+
rcutree.rcu_kick_kthreads= [KNL]
Cause the grace-period kthread to get an extra
wake_up() if it sleeps three times longer than
@@ -4871,6 +4861,13 @@
This wake_up() will be accompanied by a
WARN_ONCE() splat and an ftrace_dump().
+ rcutree.rcu_resched_ns= [KNL]
+ Limit the time spend invoking a batch of RCU
+ callbacks to the specified number of nanoseconds.
+ By default, this limit is checked only once
+ every 32 callbacks in order to limit the pain
+ inflicted by local_clock() overhead.
+
rcutree.rcu_unlock_delay= [KNL]
In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels,
this specifies an rcu_read_unlock()-time delay
@@ -4885,6 +4882,16 @@
rcu_node tree with an eye towards determining
why a new grace period has not yet started.
+ rcutree.use_softirq= [KNL]
+ If set to zero, move all RCU_SOFTIRQ processing to
+ per-CPU rcuc kthreads. Defaults to a non-zero
+ value, meaning that RCU_SOFTIRQ is used by default.
+ Specify rcutree.use_softirq=0 to use rcuc kthreads.
+
+ But note that CONFIG_PREEMPT_RT=y kernels disable
+ this kernel boot parameter, forcibly setting it
+ to zero.
+
rcuscale.gp_async= [KNL]
Measure performance of asynchronous
grace-period primitives such as call_rcu().
@@ -5087,8 +5094,17 @@
rcutorture.stall_cpu_block= [KNL]
Sleep while stalling if set. This will result
- in warnings from preemptible RCU in addition
- to any other stall-related activity.
+ in warnings from preemptible RCU in addition to
+ any other stall-related activity. Note that
+ in kernels built with CONFIG_PREEMPTION=n and
+ CONFIG_PREEMPT_COUNT=y, this parameter will
+ cause the CPU to pass through a quiescent state.
+ Given CONFIG_PREEMPTION=n, this will suppress
+ RCU CPU stall warnings, but will instead result
+ in scheduling-while-atomic splats.
+
+ Use of this module parameter results in splats.
+
rcutorture.stall_cpu_holdoff= [KNL]
Time to wait (s) after boot before inducing stall.
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 9b3f3e5f5a95..712e59ad32fa 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -412,3 +412,15 @@ Read-Copy Update (RCU)
.. kernel-doc:: include/linux/rcu_sync.h
.. kernel-doc:: kernel/rcu/sync.c
+
+.. kernel-doc:: kernel/rcu/tasks.h
+
+.. kernel-doc:: kernel/rcu/tree_stall.h
+
+.. kernel-doc:: include/linux/rcupdate_trace.h
+
+.. kernel-doc:: include/linux/rcupdate_wait.h
+
+.. kernel-doc:: include/linux/rcuref.h
+
+.. kernel-doc:: include/linux/rcutree.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 7e0b87d5aa2e..8af0aabe9bf5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17778,7 +17778,7 @@ M: Boqun Feng <[email protected]>
R: Steven Rostedt <[email protected]>
R: Mathieu Desnoyers <[email protected]>
R: Lai Jiangshan <[email protected]>
-R: Zqiang <[email protected]>
+R: Zqiang <[email protected]>
S: Supported
W: http://www.rdrop.com/users/paulmck/RCU/
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 2aba75145144..86544707236a 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -106,12 +106,22 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
#define RAW_NOTIFIER_INIT(name) { \
.head = NULL }
+#ifdef CONFIG_TREE_SRCU
#define SRCU_NOTIFIER_INIT(name, pcpu) \
{ \
.mutex = __MUTEX_INITIALIZER(name.mutex), \
.head = NULL, \
+ .srcuu = __SRCU_USAGE_INIT(name.srcuu), \
.srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
}
+#else
+#define SRCU_NOTIFIER_INIT(name, pcpu) \
+ { \
+ .mutex = __MUTEX_INITIALIZER(name.mutex), \
+ .head = NULL, \
+ .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
+ }
+#endif
#define ATOMIC_NOTIFIER_HEAD(name) \
struct atomic_notifier_head name = \
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index aae31a3e28dd..7d9c2a63b7cd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -932,9 +932,8 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
/**
* kfree_rcu() - kfree an object after a grace period.
- * @ptr: pointer to kfree for both single- and double-argument invocations.
- * @rhf: the name of the struct rcu_head within the type of @ptr,
- * but only for double-argument invocations.
+ * @ptr: pointer to kfree for double-argument invocations.
+ * @rhf: the name of the struct rcu_head within the type of @ptr.
*
* Many rcu callbacks functions just call kfree() on the base structure.
* These functions are trivial, but their size adds up, and furthermore
@@ -959,26 +958,18 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
* The BUILD_BUG_ON check must not involve any function calls, hence the
* checks are done in macros here.
*/
-#define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf)
+#define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)
+#define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)
/**
- * kvfree_rcu() - kvfree an object after a grace period.
- *
- * This macro consists of one or two arguments and it is
- * based on whether an object is head-less or not. If it
- * has a head then a semantic stays the same as it used
- * to be before:
- *
- * kvfree_rcu(ptr, rhf);
- *
- * where @ptr is a pointer to kvfree(), @rhf is the name
- * of the rcu_head structure within the type of @ptr.
+ * kfree_rcu_mightsleep() - kfree an object after a grace period.
+ * @ptr: pointer to kfree for single-argument invocations.
*
* When it comes to head-less variant, only one argument
* is passed and that is just a pointer which has to be
* freed after a grace period. Therefore the semantic is
*
- * kvfree_rcu(ptr);
+ * kfree_rcu_mightsleep(ptr);
*
* where @ptr is the pointer to be freed by kvfree().
*
@@ -987,13 +978,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
* annotation. Otherwise, please switch and embed the
* rcu_head structure within the type of @ptr.
*/
-#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \
- kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
-
+#define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
#define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
-#define kfree_rcu_mightsleep(ptr) kvfree_rcu_mightsleep(ptr)
-#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
#define kvfree_rcu_arg_2(ptr, rhf) \
do { \
typeof (ptr) ___p = (ptr); \
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 153ddc4c47ef..949d3deae506 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -33,24 +33,19 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <[email protected]>");
-torture_param(int, nwriters_stress, -1,
- "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1,
- "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (s), 0=disable");
-torture_param(int, shuffle_interval, 3,
- "Number of jiffies between shuffles, 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
torture_param(int, rt_boost, 2,
- "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
#define MAX_NESTED_LOCKS 8
@@ -120,7 +115,7 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused)
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
@@ -198,16 +193,18 @@ __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
+ unsigned long j;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) {
+ j = jiffies;
mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
+ }
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
@@ -322,7 +319,7 @@ __acquires(torture_rwlock)
static void torture_rwlock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
@@ -455,14 +452,12 @@ __acquires(torture_mutex)
static void torture_mutex_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms * 5);
- else
- mdelay(longdelay_ms / 5);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -630,7 +625,7 @@ __acquires(torture_rtmutex)
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/*
* We want a short delay mostly to emulate likely code, and
@@ -640,7 +635,7 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp)
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
@@ -695,14 +690,12 @@ __acquires(torture_rwsem)
static void torture_rwsem_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms * 10);
- else
- mdelay(longdelay_ms / 10);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -848,8 +841,8 @@ static int lock_torture_writer(void *arg)
lwsp->n_lock_acquired++;
}
- cxt.cur_ops->write_delay(&rand);
if (!skip_main_lock) {
+ cxt.cur_ops->write_delay(&rand);
lock_is_write_held = false;
WRITE_ONCE(last_lock_release, jiffies);
cxt.cur_ops->writeunlock(tid);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index e82ec9f9a5d8..d1221731c7cf 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -522,89 +522,6 @@ rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
}
-static void
-rcu_scale_cleanup(void)
-{
- int i;
- int j;
- int ngps = 0;
- u64 *wdp;
- u64 *wdpp;
-
- /*
- * Would like warning at start, but everything is expedited
- * during the mid-boot phase, so have to wait till the end.
- */
- if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
- SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
- if (rcu_gp_is_normal() && gp_exp)
- SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
- if (gp_exp && gp_async)
- SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
-
- if (torture_cleanup_begin())
- return;
- if (!cur_ops) {
- torture_cleanup_end();
- return;
- }
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++)
- torture_stop_kthread(rcu_scale_reader,
- reader_tasks[i]);
- kfree(reader_tasks);
- }
-
- if (writer_tasks) {
- for (i = 0; i < nrealwriters; i++) {
- torture_stop_kthread(rcu_scale_writer,
- writer_tasks[i]);
- if (!writer_n_durations)
- continue;
- j = writer_n_durations[i];
- pr_alert("%s%s writer %d gps: %d\n",
- scale_type, SCALE_FLAG, i, j);
- ngps += j;
- }
- pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
- scale_type, SCALE_FLAG,
- t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
- t_rcu_scale_writer_finished -
- t_rcu_scale_writer_started,
- ngps,
- rcuscale_seq_diff(b_rcu_gp_test_finished,
- b_rcu_gp_test_started));
- for (i = 0; i < nrealwriters; i++) {
- if (!writer_durations)
- break;
- if (!writer_n_durations)
- continue;
- wdpp = writer_durations[i];
- if (!wdpp)
- continue;
- for (j = 0; j < writer_n_durations[i]; j++) {
- wdp = &wdpp[j];
- pr_alert("%s%s %4d writer-duration: %5d %llu\n",
- scale_type, SCALE_FLAG,
- i, j, *wdp);
- if (j % 100 == 0)
- schedule_timeout_uninterruptible(1);
- }
- kfree(writer_durations[i]);
- }
- kfree(writer_tasks);
- kfree(writer_durations);
- kfree(writer_n_durations);
- }
-
- /* Do torture-type-specific cleanup operations. */
- if (cur_ops->cleanup != NULL)
- cur_ops->cleanup();
-
- torture_cleanup_end();
-}
-
/*
* Return the number if non-negative. If -1, the number of CPUs.
* If less than -1, that much less than the number of CPUs, but
@@ -625,20 +542,6 @@ static int compute_real(int n)
}
/*
- * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
- * down system.
- */
-static int
-rcu_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
- smp_mb(); /* Wake before output. */
- rcu_scale_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
-/*
* kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number
* of iterations and measure total time and number of GP for all iterations to complete.
*/
@@ -874,6 +777,108 @@ unwind:
return firsterr;
}
+static void
+rcu_scale_cleanup(void)
+{
+ int i;
+ int j;
+ int ngps = 0;
+ u64 *wdp;
+ u64 *wdpp;
+
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
+ if (kfree_rcu_test) {
+ kfree_scale_cleanup();
+ return;
+ }
+
+ if (torture_cleanup_begin())
+ return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_scale_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters; i++) {
+ torture_stop_kthread(rcu_scale_writer,
+ writer_tasks[i]);
+ if (!writer_n_durations)
+ continue;
+ j = writer_n_durations[i];
+ pr_alert("%s%s writer %d gps: %d\n",
+ scale_type, SCALE_FLAG, i, j);
+ ngps += j;
+ }
+ pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+ scale_type, SCALE_FLAG,
+ t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
+ t_rcu_scale_writer_finished -
+ t_rcu_scale_writer_started,
+ ngps,
+ rcuscale_seq_diff(b_rcu_gp_test_finished,
+ b_rcu_gp_test_started));
+ for (i = 0; i < nrealwriters; i++) {
+ if (!writer_durations)
+ break;
+ if (!writer_n_durations)
+ continue;
+ wdpp = writer_durations[i];
+ if (!wdpp)
+ continue;
+ for (j = 0; j < writer_n_durations[i]; j++) {
+ wdp = &wdpp[j];
+ pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+ scale_type, SCALE_FLAG,
+ i, j, *wdp);
+ if (j % 100 == 0)
+ schedule_timeout_uninterruptible(1);
+ }
+ kfree(writer_durations[i]);
+ }
+ kfree(writer_tasks);
+ kfree(writer_durations);
+ kfree(writer_n_durations);
+ }
+
+ /* Do torture-type-specific cleanup operations. */
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+/*
+ * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
+ * down system.
+ */
+static int
+rcu_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ smp_mb(); /* Wake before output. */
+ rcu_scale_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
static int __init
rcu_scale_init(void)
{
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 8f08c087142b..b770add3f843 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -241,7 +241,6 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
if (rcu_task_enqueue_lim < 0) {
rcu_task_enqueue_lim = 1;
rcu_task_cb_adjust = true;
- pr_info("%s: Setting adjustable number of callback queues.\n", __func__);
} else if (rcu_task_enqueue_lim == 0) {
rcu_task_enqueue_lim = 1;
}
@@ -272,7 +271,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
- pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim));
+
+ pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
+ data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
}
// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 54963f8c244c..1449cb69a0e0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2778,7 +2778,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
struct kvfree_rcu_bulk_data {
struct list_head list;
- unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap;
unsigned long nr_records;
void *records[];
};
@@ -2795,6 +2795,7 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
* @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
@@ -2802,6 +2803,7 @@ struct kvfree_rcu_bulk_data {
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
+ struct rcu_gp_oldstate head_free_gp_snap;
struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
@@ -2922,6 +2924,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
struct llist_node *page_list, *pos, *n;
int freed = 0;
+ if (!rcu_min_cached_objs)
+ return 0;
+
raw_spin_lock_irqsave(&krcp->lock, flags);
page_list = llist_del_all(&krcp->bkvcache);
WRITE_ONCE(krcp->nr_bkv_objs, 0);
@@ -2942,24 +2947,25 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
unsigned long flags;
int i;
- debug_rcu_bhead_unqueue(bnode);
-
- rcu_lock_acquire(&rcu_callback_map);
- if (idx == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bnode->nr_records,
- bnode->records);
-
- kfree_bulk(bnode->nr_records, bnode->records);
- } else { // vmalloc() / vfree().
- for (i = 0; i < bnode->nr_records; i++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name, bnode->records[i], 0);
-
- vfree(bnode->records[i]);
+ if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+ debug_rcu_bhead_unqueue(bnode);
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
}
+ rcu_lock_release(&rcu_callback_map);
}
- rcu_lock_release(&rcu_callback_map);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (put_cached_bnode(krcp, bnode))
@@ -3006,6 +3012,7 @@ static void kfree_rcu_work(struct work_struct *work)
struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
+ struct rcu_gp_oldstate head_gp_snap;
int i;
krwp = container_of(to_rcu_work(work),
@@ -3020,6 +3027,7 @@ static void kfree_rcu_work(struct work_struct *work)
// Channel 3.
head = krwp->head_free;
krwp->head_free = NULL;
+ head_gp_snap = krwp->head_free_gp_snap;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
// Handle the first two channels.
@@ -3036,7 +3044,8 @@ static void kfree_rcu_work(struct work_struct *work)
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
- kvfree_rcu_list(head);
+ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+ kvfree_rcu_list(head);
}
static bool
@@ -3103,7 +3112,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
INIT_LIST_HEAD(&bulk_ready[i]);
list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
- if (!poll_state_synchronize_rcu(bnode->gp_snap))
+ if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
break;
atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
@@ -3168,6 +3177,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
+ get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
atomic_set(&krcp->head_count, 0);
WRITE_ONCE(krcp->head, NULL);
}
@@ -3216,7 +3226,7 @@ static void fill_page_cache_func(struct work_struct *work)
nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
1 : rcu_min_cached_objs;
- for (i = 0; i < nr_pages; i++) {
+ for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -3240,6 +3250,10 @@ static void fill_page_cache_func(struct work_struct *work)
static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
+ // If cache disabled, bail out.
+ if (!rcu_min_cached_objs)
+ return;
+
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
if (atomic_read(&krcp->backoff_page_cache_fill)) {
@@ -3294,7 +3308,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
// scenarios.
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- *krcp = krc_this_cpu_lock(flags);
+ raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
}
if (!bnode)
@@ -3307,7 +3321,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
// Finally insert and update the GP for this page.
bnode->records[bnode->nr_records++] = ptr;
- bnode->gp_snap = get_state_synchronize_rcu();
+ get_state_synchronize_rcu_full(&bnode->gp_snap);
atomic_inc(&(*krcp)->bulk_count[idx]);
return true;
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index f2280616f9d5..43229d2b0c44 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1319,13 +1319,22 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
int cpu;
unsigned long count = 0;
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+
+ /* Protect rcu_nocb_mask against concurrent (de-)offloading. */
+ if (!mutex_trylock(&rcu_state.barrier_mutex))
+ return 0;
+
/* Snapshot count of all CPUs */
- for_each_possible_cpu(cpu) {
+ for_each_cpu(cpu, rcu_nocb_mask) {
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
count += READ_ONCE(rdp->lazy_len);
}
+ mutex_unlock(&rcu_state.barrier_mutex);
+
return count ? count : SHRINK_EMPTY;
}
@@ -1336,15 +1345,45 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long flags;
unsigned long count = 0;
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+ /*
+ * Protect against concurrent (de-)offloading. Otherwise nocb locking
+ * may be ignored or imbalanced.
+ */
+ if (!mutex_trylock(&rcu_state.barrier_mutex)) {
+ /*
+ * But really don't insist if barrier_mutex is contended since we
+ * can't guarantee that it will never engage in a dependency
+ * chain involving memory allocation. The lock is seldom contended
+ * anyway.
+ */
+ return 0;
+ }
+
/* Snapshot count of all CPUs */
- for_each_possible_cpu(cpu) {
+ for_each_cpu(cpu, rcu_nocb_mask) {
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int _count = READ_ONCE(rdp->lazy_len);
+ int _count;
+
+ if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)))
+ continue;
- if (_count == 0)
+ if (!READ_ONCE(rdp->lazy_len))
continue;
+
rcu_nocb_lock_irqsave(rdp, flags);
- WRITE_ONCE(rdp->lazy_len, 0);
+ /*
+ * Recheck under the nocb lock. Since we are not holding the bypass
+ * lock we may still race with increments from the enqueuer but still
+ * we know for sure if there is at least one lazy callback.
+ */
+ _count = READ_ONCE(rdp->lazy_len);
+ if (!_count) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue;
+ }
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp(rdp, false);
sc->nr_to_scan -= _count;
@@ -1352,6 +1391,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
if (sc->nr_to_scan <= 0)
break;
}
+
+ mutex_unlock(&rcu_state.barrier_mutex);
+
return count ? count : SHRINK_STOP;
}
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index b52d5069563c..48b9147e8c91 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -250,7 +250,7 @@ identify_qemu_args () {
echo -machine virt,gic-version=host -cpu host
;;
qemu-system-ppc64)
- echo -enable-kvm -M pseries -nodefaults
+ echo -M pseries -nodefaults
echo -device spapr-vscsi
if test -n "$TORTURE_QEMU_INTERACTIVE" -a -n "$TORTURE_QEMU_MAC"
then
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot
index f57720c52c0f..84f6bb98ce99 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot
@@ -5,4 +5,4 @@ rcutree.gp_init_delay=3
rcutree.gp_cleanup_delay=3
rcutree.kthread_prio=2
threadirqs
-tree.use_softirq=0
+rcutree.use_softirq=0
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
index 64f864f1f361..8e50bfd4b710 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
@@ -4,4 +4,4 @@ rcutree.gp_init_delay=3
rcutree.gp_cleanup_delay=3
rcutree.kthread_prio=2
threadirqs
-tree.use_softirq=0
+rcutree.use_softirq=0