diff options
-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 140 | ||||
-rw-r--r-- | Documentation/core-api/kernel-api.rst | 12 | ||||
-rw-r--r-- | MAINTAINERS | 2 | ||||
-rw-r--r-- | include/linux/notifier.h | 10 | ||||
-rw-r--r-- | include/linux/rcupdate.h | 29 | ||||
-rw-r--r-- | kernel/locking/locktorture.c | 51 | ||||
-rw-r--r-- | kernel/rcu/rcuscale.c | 199 | ||||
-rw-r--r-- | kernel/rcu/tasks.h | 5 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 58 | ||||
-rw-r--r-- | kernel/rcu/tree_nocb.h | 52 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/bin/functions.sh | 2 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot | 2 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 2 |
13 files changed, 322 insertions, 242 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9e5bab29685f..5b544b3f00fc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4736,43 +4736,6 @@ the propagation of recent CPU-hotplug changes up the rcu_node combining tree. - rcutree.use_softirq= [KNL] - If set to zero, move all RCU_SOFTIRQ processing to - per-CPU rcuc kthreads. Defaults to a non-zero - value, meaning that RCU_SOFTIRQ is used by default. - Specify rcutree.use_softirq=0 to use rcuc kthreads. - - But note that CONFIG_PREEMPT_RT=y kernels disable - this kernel boot parameter, forcibly setting it - to zero. - - rcutree.rcu_fanout_exact= [KNL] - Disable autobalancing of the rcu_node combining - tree. This is used by rcutorture, and might - possibly be useful for architectures having high - cache-to-cache transfer latencies. - - rcutree.rcu_fanout_leaf= [KNL] - Change the number of CPUs assigned to each - leaf rcu_node structure. Useful for very - large systems, which will choose the value 64, - and for NUMA systems with large remote-access - latencies, which will choose a value aligned - with the appropriate hardware boundaries. - - rcutree.rcu_min_cached_objs= [KNL] - Minimum number of objects which are cached and - maintained per one CPU. Object size is equal - to PAGE_SIZE. The cache allows to reduce the - pressure to page allocator, also it makes the - whole algorithm to behave better in low memory - condition. - - rcutree.rcu_delay_page_cache_fill_msec= [KNL] - Set the page-cache refill delay (in milliseconds) - in response to low-memory conditions. The range - of permitted values is in the range 0:100000. - rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. @@ -4811,21 +4774,6 @@ When RCU_NOCB_CPU is set, also adjust the priority of NOCB callback kthreads. - rcutree.rcu_divisor= [KNL] - Set the shift-right count to use to compute - the callback-invocation batch limit bl from - the number of callbacks queued on this CPU. - The result will be bounded below by the value of - the rcutree.blimit kernel parameter. Every bl - callbacks, the softirq handler will exit in - order to allow the CPU to do other work. - - Please note that this callback-invocation batch - limit applies only to non-offloaded callback - invocation. Offloaded callbacks are instead - invoked in the context of an rcuoc kthread, which - scheduler will preempt as it does any other task. - rcutree.nocb_nobypass_lim_per_jiffy= [KNL] On callback-offloaded (rcu_nocbs) CPUs, RCU reduces the lock contention that would @@ -4839,14 +4787,6 @@ the ->nocb_bypass queue. The definition of "too many" is supplied by this kernel boot parameter. - rcutree.rcu_nocb_gp_stride= [KNL] - Set the number of NOCB callback kthreads in - each group, which defaults to the square root - of the number of CPUs. Larger numbers reduce - the wakeup overhead on the global grace-period - kthread, but increases that same overhead on - each group's NOCB grace-period kthread. - rcutree.qhimark= [KNL] Set threshold of queued RCU callbacks beyond which batch limiting is disabled. @@ -4864,6 +4804,56 @@ on rcutree.qhimark at boot time and to zero to disable more aggressive help enlistment. + rcutree.rcu_delay_page_cache_fill_msec= [KNL] + Set the page-cache refill delay (in milliseconds) + in response to low-memory conditions. The range + of permitted values is in the range 0:100000. + + rcutree.rcu_divisor= [KNL] + Set the shift-right count to use to compute + the callback-invocation batch limit bl from + the number of callbacks queued on this CPU. + The result will be bounded below by the value of + the rcutree.blimit kernel parameter. Every bl + callbacks, the softirq handler will exit in + order to allow the CPU to do other work. + + Please note that this callback-invocation batch + limit applies only to non-offloaded callback + invocation. Offloaded callbacks are instead + invoked in the context of an rcuoc kthread, which + scheduler will preempt as it does any other task. + + rcutree.rcu_fanout_exact= [KNL] + Disable autobalancing of the rcu_node combining + tree. This is used by rcutorture, and might + possibly be useful for architectures having high + cache-to-cache transfer latencies. + + rcutree.rcu_fanout_leaf= [KNL] + Change the number of CPUs assigned to each + leaf rcu_node structure. Useful for very + large systems, which will choose the value 64, + and for NUMA systems with large remote-access + latencies, which will choose a value aligned + with the appropriate hardware boundaries. + + rcutree.rcu_min_cached_objs= [KNL] + Minimum number of objects which are cached and + maintained per one CPU. Object size is equal + to PAGE_SIZE. The cache allows to reduce the + pressure to page allocator, also it makes the + whole algorithm to behave better in low memory + condition. + + rcutree.rcu_nocb_gp_stride= [KNL] + Set the number of NOCB callback kthreads in + each group, which defaults to the square root + of the number of CPUs. Larger numbers reduce + the wakeup overhead on the global grace-period + kthread, but increases that same overhead on + each group's NOCB grace-period kthread. + rcutree.rcu_kick_kthreads= [KNL] Cause the grace-period kthread to get an extra wake_up() if it sleeps three times longer than @@ -4871,6 +4861,13 @@ This wake_up() will be accompanied by a WARN_ONCE() splat and an ftrace_dump(). + rcutree.rcu_resched_ns= [KNL] + Limit the time spend invoking a batch of RCU + callbacks to the specified number of nanoseconds. + By default, this limit is checked only once + every 32 callbacks in order to limit the pain + inflicted by local_clock() overhead. + rcutree.rcu_unlock_delay= [KNL] In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, this specifies an rcu_read_unlock()-time delay @@ -4885,6 +4882,16 @@ rcu_node tree with an eye towards determining why a new grace period has not yet started. + rcutree.use_softirq= [KNL] + If set to zero, move all RCU_SOFTIRQ processing to + per-CPU rcuc kthreads. Defaults to a non-zero + value, meaning that RCU_SOFTIRQ is used by default. + Specify rcutree.use_softirq=0 to use rcuc kthreads. + + But note that CONFIG_PREEMPT_RT=y kernels disable + this kernel boot parameter, forcibly setting it + to zero. + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). @@ -5087,8 +5094,17 @@ rcutorture.stall_cpu_block= [KNL] Sleep while stalling if set. This will result - in warnings from preemptible RCU in addition - to any other stall-related activity. + in warnings from preemptible RCU in addition to + any other stall-related activity. Note that + in kernels built with CONFIG_PREEMPTION=n and + CONFIG_PREEMPT_COUNT=y, this parameter will + cause the CPU to pass through a quiescent state. + Given CONFIG_PREEMPTION=n, this will suppress + RCU CPU stall warnings, but will instead result + in scheduling-while-atomic splats. + + Use of this module parameter results in splats. + rcutorture.stall_cpu_holdoff= [KNL] Time to wait (s) after boot before inducing stall. diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 9b3f3e5f5a95..712e59ad32fa 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -412,3 +412,15 @@ Read-Copy Update (RCU) .. kernel-doc:: include/linux/rcu_sync.h .. kernel-doc:: kernel/rcu/sync.c + +.. kernel-doc:: kernel/rcu/tasks.h + +.. kernel-doc:: kernel/rcu/tree_stall.h + +.. kernel-doc:: include/linux/rcupdate_trace.h + +.. kernel-doc:: include/linux/rcupdate_wait.h + +.. kernel-doc:: include/linux/rcuref.h + +.. kernel-doc:: include/linux/rcutree.h diff --git a/MAINTAINERS b/MAINTAINERS index 7e0b87d5aa2e..8af0aabe9bf5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17778,7 +17778,7 @@ M: Boqun Feng <[email protected]> R: Steven Rostedt <[email protected]> R: Mathieu Desnoyers <[email protected]> R: Lai Jiangshan <[email protected]> -R: Zqiang <[email protected]> +R: Zqiang <[email protected]> S: Supported W: http://www.rdrop.com/users/paulmck/RCU/ diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 2aba75145144..86544707236a 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -106,12 +106,22 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } +#ifdef CONFIG_TREE_SRCU #define SRCU_NOTIFIER_INIT(name, pcpu) \ { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ + .srcuu = __SRCU_USAGE_INIT(name.srcuu), \ .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ } +#else +#define SRCU_NOTIFIER_INIT(name, pcpu) \ + { \ + .mutex = __MUTEX_INITIALIZER(name.mutex), \ + .head = NULL, \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ + } +#endif #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index aae31a3e28dd..7d9c2a63b7cd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -932,9 +932,8 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /** * kfree_rcu() - kfree an object after a grace period. - * @ptr: pointer to kfree for both single- and double-argument invocations. - * @rhf: the name of the struct rcu_head within the type of @ptr, - * but only for double-argument invocations. + * @ptr: pointer to kfree for double-argument invocations. + * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore @@ -959,26 +958,18 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ -#define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf) +#define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) +#define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) /** - * kvfree_rcu() - kvfree an object after a grace period. - * - * This macro consists of one or two arguments and it is - * based on whether an object is head-less or not. If it - * has a head then a semantic stays the same as it used - * to be before: - * - * kvfree_rcu(ptr, rhf); - * - * where @ptr is a pointer to kvfree(), @rhf is the name - * of the rcu_head structure within the type of @ptr. + * kfree_rcu_mightsleep() - kfree an object after a grace period. + * @ptr: pointer to kfree for single-argument invocations. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * - * kvfree_rcu(ptr); + * kfree_rcu_mightsleep(ptr); * * where @ptr is the pointer to be freed by kvfree(). * @@ -987,13 +978,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ -#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ - kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) - +#define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) -#define kfree_rcu_mightsleep(ptr) kvfree_rcu_mightsleep(ptr) -#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 153ddc4c47ef..949d3deae506 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,24 +33,19 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <[email protected]>"); -torture_param(int, nwriters_stress, -1, - "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, - "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); -torture_param(int, shuffle_interval, 3, - "Number of jiffies between shuffles, 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(int, rt_boost, 2, - "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 @@ -120,7 +115,7 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % @@ -198,16 +193,18 @@ __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; + unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) { + j = jiffies; mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); + } + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -322,7 +319,7 @@ __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. @@ -455,14 +452,12 @@ __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 5); - else - mdelay(longdelay_ms / 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -630,7 +625,7 @@ __acquires(torture_rtmutex) static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* * We want a short delay mostly to emulate likely code, and @@ -640,7 +635,7 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -695,14 +690,12 @@ __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 10); - else - mdelay(longdelay_ms / 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -848,8 +841,8 @@ static int lock_torture_writer(void *arg) lwsp->n_lock_acquired++; } - cxt.cur_ops->write_delay(&rand); if (!skip_main_lock) { + cxt.cur_ops->write_delay(&rand); lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(tid); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index e82ec9f9a5d8..d1221731c7cf 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -522,89 +522,6 @@ rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); } -static void -rcu_scale_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. - */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_scale_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_scale_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - scale_type, SCALE_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - scale_type, SCALE_FLAG, - t_rcu_scale_writer_started, t_rcu_scale_writer_finished, - t_rcu_scale_writer_finished - - t_rcu_scale_writer_started, - ngps, - rcuscale_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j < writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - scale_type, SCALE_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - /* * Return the number if non-negative. If -1, the number of CPUs. * If less than -1, that much less than the number of CPUs, but @@ -625,20 +542,6 @@ static int compute_real(int n) } /* - * RCU scalability shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_scale_shutdown(void *arg) -{ - wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_scale_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -/* * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -874,6 +777,108 @@ unwind: return firsterr; } +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + if (kfree_rcu_test) { + kfree_scale_cleanup(); + return; + } + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j < writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + static int __init rcu_scale_init(void) { diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8f08c087142b..b770add3f843 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -241,7 +241,6 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; rcu_task_cb_adjust = true; - pr_info("%s: Setting adjustable number of callback queues.\n", __func__); } else if (rcu_task_enqueue_lim == 0) { rcu_task_enqueue_lim = 1; } @@ -272,7 +271,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); - pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, + data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); } // IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 54963f8c244c..1449cb69a0e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2778,7 +2778,7 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kvfree_rcu_bulk_data { struct list_head list; - unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap; unsigned long nr_records; void *records[]; }; @@ -2795,6 +2795,7 @@ struct kvfree_rcu_bulk_data { * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ @@ -2802,6 +2803,7 @@ struct kvfree_rcu_bulk_data { struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; struct list_head bulk_head_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; @@ -2922,6 +2924,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) struct llist_node *page_list, *pos, *n; int freed = 0; + if (!rcu_min_cached_objs) + return 0; + raw_spin_lock_irqsave(&krcp->lock, flags); page_list = llist_del_all(&krcp->bkvcache); WRITE_ONCE(krcp->nr_bkv_objs, 0); @@ -2942,24 +2947,25 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, unsigned long flags; int i; - debug_rcu_bhead_unqueue(bnode); - - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). - for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). + for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, bnode->records[i], 0); + + vfree(bnode->records[i]); + } } + rcu_lock_release(&rcu_callback_map); } - rcu_lock_release(&rcu_callback_map); raw_spin_lock_irqsave(&krcp->lock, flags); if (put_cached_bnode(krcp, bnode)) @@ -3006,6 +3012,7 @@ static void kfree_rcu_work(struct work_struct *work) struct rcu_head *head; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + struct rcu_gp_oldstate head_gp_snap; int i; krwp = container_of(to_rcu_work(work), @@ -3020,6 +3027,7 @@ static void kfree_rcu_work(struct work_struct *work) // Channel 3. head = krwp->head_free; krwp->head_free = NULL; + head_gp_snap = krwp->head_free_gp_snap; raw_spin_unlock_irqrestore(&krcp->lock, flags); // Handle the first two channels. @@ -3036,7 +3044,8 @@ static void kfree_rcu_work(struct work_struct *work) * queued on a linked list through their rcu_head structures. * This list is named "Channel 3". */ - kvfree_rcu_list(head); + if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) + kvfree_rcu_list(head); } static bool @@ -3103,7 +3112,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) INIT_LIST_HEAD(&bulk_ready[i]); list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu(bnode->gp_snap)) + if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) break; atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); @@ -3168,6 +3177,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; + get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); atomic_set(&krcp->head_count, 0); WRITE_ONCE(krcp->head, NULL); } @@ -3216,7 +3226,7 @@ static void fill_page_cache_func(struct work_struct *work) nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? 1 : rcu_min_cached_objs; - for (i = 0; i < nr_pages; i++) { + for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -3240,6 +3250,10 @@ static void fill_page_cache_func(struct work_struct *work) static void run_page_cache_worker(struct kfree_rcu_cpu *krcp) { + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { @@ -3294,7 +3308,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // scenarios. bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - *krcp = krc_this_cpu_lock(flags); + raw_spin_lock_irqsave(&(*krcp)->lock, *flags); } if (!bnode) @@ -3307,7 +3321,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // Finally insert and update the GP for this page. bnode->records[bnode->nr_records++] = ptr; - bnode->gp_snap = get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); return true; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f2280616f9d5..43229d2b0c44 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1319,13 +1319,22 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) int cpu; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + + /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ + if (!mutex_trylock(&rcu_state.barrier_mutex)) + return 0; + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); count += READ_ONCE(rdp->lazy_len); } + mutex_unlock(&rcu_state.barrier_mutex); + return count ? count : SHRINK_EMPTY; } @@ -1336,15 +1345,45 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long flags; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + /* + * Protect against concurrent (de-)offloading. Otherwise nocb locking + * may be ignored or imbalanced. + */ + if (!mutex_trylock(&rcu_state.barrier_mutex)) { + /* + * But really don't insist if barrier_mutex is contended since we + * can't guarantee that it will never engage in a dependency + * chain involving memory allocation. The lock is seldom contended + * anyway. + */ + return 0; + } + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int _count = READ_ONCE(rdp->lazy_len); + int _count; + + if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp))) + continue; - if (_count == 0) + if (!READ_ONCE(rdp->lazy_len)) continue; + rcu_nocb_lock_irqsave(rdp, flags); - WRITE_ONCE(rdp->lazy_len, 0); + /* + * Recheck under the nocb lock. Since we are not holding the bypass + * lock we may still race with increments from the enqueuer but still + * we know for sure if there is at least one lazy callback. + */ + _count = READ_ONCE(rdp->lazy_len); + if (!_count) { + rcu_nocb_unlock_irqrestore(rdp, flags); + continue; + } + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); sc->nr_to_scan -= _count; @@ -1352,6 +1391,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) if (sc->nr_to_scan <= 0) break; } + + mutex_unlock(&rcu_state.barrier_mutex); + return count ? count : SHRINK_STOP; } diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index b52d5069563c..48b9147e8c91 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -250,7 +250,7 @@ identify_qemu_args () { echo -machine virt,gic-version=host -cpu host ;; qemu-system-ppc64) - echo -enable-kvm -M pseries -nodefaults + echo -M pseries -nodefaults echo -device spapr-vscsi if test -n "$TORTURE_QEMU_INTERACTIVE" -a -n "$TORTURE_QEMU_MAC" then diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot index f57720c52c0f..84f6bb98ce99 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot @@ -5,4 +5,4 @@ rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 threadirqs -tree.use_softirq=0 +rcutree.use_softirq=0 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 64f864f1f361..8e50bfd4b710 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -4,4 +4,4 @@ rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 threadirqs -tree.use_softirq=0 +rcutree.use_softirq=0 |