From 32d99593bdc91442fe6183d97b060210cc9c8193 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 16 Mar 2024 00:10:15 -0700 Subject: rcu: Add lockdep_assert_in_rcu_read_lock() and friends There is no direct RCU counterpart to lockdep_assert_irqs_disabled() and friends. Although it is possible to construct them, it would be more convenient to have the following lockdep assertions: lockdep_assert_in_rcu_read_lock() lockdep_assert_in_rcu_read_lock_bh() lockdep_assert_in_rcu_read_lock_sched() lockdep_assert_in_rcu_reader() This commit therefore creates them. Reported-by: Jens Axboe Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dfd2399f2cde..8470a85f6563 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -421,11 +421,71 @@ static inline void rcu_preempt_sleep_check(void) { } "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) +// See RCU_LOCKDEP_WARN() for an explanation of the double call to +// debug_lockdep_rcu_enabled(). +static inline bool lockdep_assert_rcu_helper(bool c) +{ + return debug_lockdep_rcu_enabled() && + (c || !rcu_is_watching() || !rcu_lockdep_current_cpu_online()) && + debug_lockdep_rcu_enabled(); +} + +/** + * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock() + * + * Splats if lockdep is enabled and there is no rcu_read_lock() in effect. + */ +#define lockdep_assert_in_rcu_read_lock() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map))) + +/** + * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh() + * + * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect. + * Note that local_bh_disable() and friends do not suffice here, instead an + * actual rcu_read_lock_bh() is required. + */ +#define lockdep_assert_in_rcu_read_lock_bh() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map))) + +/** + * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched() + * + * Splats if lockdep is enabled and there is no rcu_read_lock_sched() + * in effect. Note that preempt_disable() and friends do not suffice here, + * instead an actual rcu_read_lock_sched() is required. + */ +#define lockdep_assert_in_rcu_read_lock_sched() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map))) + +/** + * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader + * + * Splats if lockdep is enabled and there is no RCU reader of any + * type in effect. Note that regions of code protected by things like + * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify + * as RCU readers. + * + * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY + * kernels that are not also built with PREEMPT_COUNT. But if you have + * lockdep enabled, you might as well also enable PREEMPT_COUNT. + */ +#define lockdep_assert_in_rcu_reader() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) && \ + !lock_is_held(&rcu_bh_lock_map) && \ + !lock_is_held(&rcu_sched_lock_map) && \ + preemptible())) + #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) +#define lockdep_assert_in_rcu_read_lock() do { } while (0) +#define lockdep_assert_in_rcu_read_lock_bh() do { } while (0) +#define lockdep_assert_in_rcu_read_lock_sched() do { } while (0) +#define lockdep_assert_in_rcu_reader() do { } while (0) + #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* -- cgit From ce418966a83320837f59f04a646fa4716e132dc8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:18:32 +0200 Subject: rcu/nocb: Fix segcblist state machine comments about bypass The parts explaining the bypass lifecycle in (de-)offloading are out of date and/or wrong. Bypass is simply enabled whenever SEGCBLIST_RCU_CORE flag is off. Fix the comments accordingly. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 659d13a7ddaa..1e5b4ef167ca 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -141,7 +141,7 @@ struct rcu_cblist { * | SEGCBLIST_KTHREAD_GP | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | - * | handles callbacks concurrently. Bypass enqueue is enabled. | + * | handles callbacks concurrently. Bypass enqueue is disabled. | * | Invoke RCU core so we make sure not to preempt it in the middle with | * | leaving some urgent work unattended within a jiffy. | * ---------------------------------------------------------------------------- @@ -154,8 +154,7 @@ struct rcu_cblist { * | SEGCBLIST_KTHREAD_GP | * | | * | CB/GP kthreads and local rcu_core() handle callbacks concurrently | - * | holding nocb_lock. Wake up CB and GP kthreads if necessary. Disable | - * | bypass enqueue. | + * | holding nocb_lock. Wake up CB and GP kthreads if necessary. | * ---------------------------------------------------------------------------- * | * v @@ -185,7 +184,7 @@ struct rcu_cblist { * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | - * | Flush pending nocb_timer. Flush nocb bypass callbacks. | + * | Flush pending nocb_timer. | * ---------------------------------------------------------------------------- * | * v -- cgit From aa97b9a56906f5965a7c5752790d174cadc8b820 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:18:33 +0200 Subject: rcu/nocb: Fix segcblist state machine stale comments about timers The (de-)offloading process used to take care about the NOCB timer when it depended on the per-rdp NOCB locking. However this isn't the case anymore for a long while. It can now safely be armed and run during the (de-)offloading process, which doesn't care about it anymore. Update the comments accordingly. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 1e5b4ef167ca..8018045989af 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -80,8 +80,7 @@ struct rcu_cblist { * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED | * | | * | Callbacks processed by rcu_core() from softirqs or local | - * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads, | - * | allowing nocb_timer to be armed. | + * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads. | * ---------------------------------------------------------------------------- * | * v @@ -183,8 +182,7 @@ struct rcu_cblist { * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | * | | * | Callbacks processed by rcu_core() from softirqs or local | - * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | - * | Flush pending nocb_timer. | + * | rcuc kthread, while holding nocb_lock. | * ---------------------------------------------------------------------------- * | * v -- cgit From 483d5bf23125a9127ffcb3f7a3b3539b34df67d4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:18:34 +0200 Subject: rcu/nocb: Use kthread parking instead of ad-hoc implementation Upon NOCB deoffloading, the rcuo kthread must be forced to sleep until the corresponding rdp is ever offloaded again. The deoffloader clears the SEGCBLIST_OFFLOADED flag, wakes up the rcuo kthread which then notices that change and clears in turn its SEGCBLIST_KTHREAD_CB flag before going to sleep, until it ever sees the SEGCBLIST_OFFLOADED flag again, should a re-offloading happen. Upon NOCB offloading, the rcuo kthread must be forced to wake up and handle callbacks until the corresponding rdp is ever deoffloaded again. The offloader sets the SEGCBLIST_OFFLOADED flag, wakes up the rcuo kthread which then notices that change and sets in turn its SEGCBLIST_KTHREAD_CB flag before going to check callbacks, until it ever sees the SEGCBLIST_OFFLOADED flag cleared again, should a de-offloading happen again. This is all a crude ad-hoc and error-prone kthread (un-)parking re-implementation. Consolidate the behaviour with the appropriate API instead. [ paulmck: Apply Qiang Zhang feedback provided in Link: below. ] Link: https://lore.kernel.org/all/20240509074046.15629-1-qiang.zhang1211@gmail.com/ Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 81 +++++++++++++---------------- kernel/rcu/tree_nocb.h | 115 +++++++++++++----------------------------- kernel/rcu/tree_plugin.h | 4 +- 3 files changed, 72 insertions(+), 128 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 8018045989af..ba95c06675e1 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -84,31 +84,31 @@ struct rcu_cblist { * ---------------------------------------------------------------------------- * | * v - * ----------------------------------- - * | | - * v v - * --------------------------------------- ----------------------------------| - * | SEGCBLIST_RCU_CORE | | | SEGCBLIST_RCU_CORE | | - * | SEGCBLIST_LOCKING | | | SEGCBLIST_LOCKING | | - * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | SEGCBLIST_KTHREAD_GP | - * | | | | - * | | | | - * | CB kthread woke up and | | GP kthread woke up and | - * | acknowledged SEGCBLIST_OFFLOADED. | | acknowledged SEGCBLIST_OFFLOADED| - * | Processes callbacks concurrently | | | - * | with rcu_core(), holding | | | - * | nocb_lock. | | | - * --------------------------------------- ----------------------------------- - * | | - * ----------------------------------- + * ---------------------------------------------------------------------------- + * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED | + * | + unparked CB kthread | + * | | + * | CB kthread got unparked and processes callbacks concurrently with | + * | rcu_core(), holding nocb_lock. | + * --------------------------------------------------------------------------- + * | + * v + * ---------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | + * | | + * | GP kthread woke up and acknowledged nocb_lock. | + * ---------------------------------------- ----------------------------------- * | * v * |--------------------------------------------------------------------------| - * | SEGCBLIST_LOCKING | | - * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | * | SEGCBLIST_KTHREAD_GP | | - * | SEGCBLIST_KTHREAD_CB | + * | + unparked CB kthread | * | | * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops | * | handling callbacks. Enable bypass queueing. | @@ -124,8 +124,8 @@ struct rcu_cblist { * |--------------------------------------------------------------------------| * | SEGCBLIST_LOCKING | | * | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | * | ignores callbacks. Bypass enqueue is enabled. | @@ -136,8 +136,8 @@ struct rcu_cblist { * | SEGCBLIST_RCU_CORE | | * | SEGCBLIST_LOCKING | | * | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | * | handles callbacks concurrently. Bypass enqueue is disabled. | @@ -149,40 +149,31 @@ struct rcu_cblist { * |--------------------------------------------------------------------------| * | SEGCBLIST_RCU_CORE | | * | SEGCBLIST_LOCKING | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads and local rcu_core() handle callbacks concurrently | - * | holding nocb_lock. Wake up CB and GP kthreads if necessary. | + * | holding nocb_lock. Wake up GP kthread if necessary. | * ---------------------------------------------------------------------------- * | * v - * ----------------------------------- - * | | - * v v - * ---------------------------------------------------------------------------| - * | | | - * | SEGCBLIST_RCU_CORE | | SEGCBLIST_RCU_CORE | | - * | SEGCBLIST_LOCKING | | SEGCBLIST_LOCKING | | - * | SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | - * | | | - * | GP kthread woke up and | CB kthread woke up and | - * | acknowledged the fact that | acknowledged the fact that | - * | SEGCBLIST_OFFLOADED got cleared. | SEGCBLIST_OFFLOADED got cleared. | - * | | The CB kthread goes to sleep | - * | The callbacks from the target CPU | until it ever gets re-offloaded. | - * | will be ignored from the GP kthread | | - * | loop. | | + * |--------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | + * | + unparked CB kthread | + * | | + * | GP kthread woke up and acknowledged the fact that SEGCBLIST_OFFLOADED | + * | got cleared. The callbacks from the target CPU will be ignored from the| + * | GP kthread loop. | * ---------------------------------------------------------------------------- - * | | - * ----------------------------------- * | * v * ---------------------------------------------------------------------------- * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | + * | + parked CB kthread | * | | - * | Callbacks processed by rcu_core() from softirqs or local | - * | rcuc kthread, while holding nocb_lock. | + * | CB kthread is parked. Callbacks processed by rcu_core() from softirqs or | + * | local rcuc kthread, while holding nocb_lock. | * ---------------------------------------------------------------------------- * | * v diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3f85577bddd4..808c9a19fe1d 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -635,8 +635,7 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp, - bool *wake_state) +static int nocb_gp_toggle_rdp(struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; @@ -650,8 +649,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, * We will handle this rdp until it ever gets de-offloaded. */ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; ret = 1; } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { @@ -660,8 +657,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, * We will ignore this rdp until it ever gets re-offloaded. */ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; ret = 0; } else { WARN_ON_ONCE(1); @@ -877,16 +872,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - bool wake_state = false; int ret; - ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state); + ret = nocb_gp_toggle_rdp(rdp_toggling); if (ret == 1) list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); else if (ret == 0) list_del(&rdp_toggling->nocb_entry_rdp); - if (wake_state) - swake_up_one(&rdp_toggling->nocb_state_wq); + + swake_up_one(&rdp_toggling->nocb_state_wq); } my_rdp->nocb_gp_seq = -1; @@ -913,16 +907,9 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } -static inline bool nocb_cb_can_run(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) { - return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); + return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park(); } /* @@ -934,21 +921,19 @@ static void nocb_cb_wait(struct rcu_data *rdp) struct rcu_segcblist *cblist = &rdp->cblist; unsigned long cur_gp_seq; unsigned long flags; - bool needwake_state = false; bool needwake_gp = false; - bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - if (READ_ONCE(rdp->nocb_cb_sleep)) { - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + if (kthread_should_park()) { + kthread_parkme(); + } else if (READ_ONCE(rdp->nocb_cb_sleep)) { + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -971,37 +956,16 @@ static void nocb_cb_wait(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - if (rcu_segcblist_ready_cbs(cblist)) - can_sleep = false; + if (!rcu_segcblist_ready_cbs(cblist)) { + WRITE_ONCE(rdp->nocb_cb_sleep, true); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We won't touch the callbacks and keep sleeping until we ever - * get re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; + WRITE_ONCE(rdp->nocb_cb_sleep, false); } - WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); - - if (rdp->nocb_cb_sleep) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); - rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); } /* @@ -1094,17 +1058,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp, bool wake_gp = false; rcu_segcblist_offload(cblist, offload); - - if (rdp->nocb_cb_sleep) - rdp->nocb_cb_sleep = false; rcu_nocb_unlock_irqrestore(rdp, flags); - /* - * Ignore former value of nocb_cb_sleep and force wake up as it could - * have been spuriously set to false already. - */ - swake_up_one(&rdp->nocb_cb_wq); - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp); @@ -1161,19 +1116,11 @@ static long rcu_nocb_rdp_deoffload(void *arg) if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - /* - * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB. - * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog. - */ - if (!rdp->nocb_cb_kthread) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_nocb_unlock_irqrestore(rdp, flags); - } - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + !rcu_segcblist_test_flags(cblist, + SEGCBLIST_KTHREAD_GP)); + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list @@ -1181,8 +1128,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) * but we stick to paranoia in this rare path. */ rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); rcu_nocb_unlock_irqrestore(rdp, flags); list_del(&rdp->nocb_entry_rdp); @@ -1282,8 +1228,10 @@ static long rcu_nocb_rdp_offload(void *arg) wake_gp = rdp_offload_toggle(rdp, true, flags); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); + + kthread_unpark(rdp->nocb_cb_kthread); + swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); /* @@ -1468,7 +1416,7 @@ void __init rcu_init_nohz(void) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); @@ -1526,11 +1474,16 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_cb_kthread, rdp, - "rcuo%c/%d", rcu_state.abbr, cpu); + t = kthread_create(rcu_nocb_cb_kthread, rdp, + "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) goto end; + if (rcu_rdp_is_offloaded(rdp)) + wake_up_process(t); + else + kthread_park(t); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 340bbefe5f65..1facbb365dc2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,8 +28,8 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || rcu_lockdep_is_held_nocb(rdp) || - (rdp == this_cpu_ptr(&rcu_data) && - !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || + (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && + rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" ); -- cgit From e4f78057291608f6968a6789c5ebb3bde7d95504 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:18:35 +0200 Subject: rcu/nocb: Remove buggy bypass lock contention mitigation The bypass lock contention mitigation assumes there can be at most 2 contenders on the bypass lock, following this scheme: 1) One kthread takes the bypass lock 2) Another one spins on it and increment the contended counter 3) A third one (a bypass enqueuer) sees the contended counter on and busy loops waiting on it to decrement. However this assumption is wrong. There can be only one CPU to find the lock contended because call_rcu() (the bypass enqueuer) is the only bypass lock acquire site that may not already hold the NOCB lock beforehand, all the other sites must first contend on the NOCB lock. Therefore step 2) is impossible. The other problem is that the mitigation assumes that contenders all belong to the same rdp CPU, which is also impossible for a raw spinlock. In theory the warning could trigger if the enqueuer holds the bypass lock and another CPU flushes the bypass queue concurrently but this is prevented from all flush users: 1) NOCB kthreads only flush if they successfully _tried_ to lock the bypass lock. So no contention management here. 2) Flush on callbacks migration happen remotely when the CPU is offline. No concurrency against bypass enqueue. 3) Flush on deoffloading happen either locally with IRQs disabled or remotely when the CPU is not yet online. No concurrency against bypass enqueue. 4) Flush on barrier entrain happen either locally with IRQs disabled or remotely when the CPU is offline. No concurrency against bypass enqueue. For those reasons, the bypass lock contention mitigation isn't needed and is even wrong. Remove it but keep the warning reporting a contended bypass lock on a remote CPU, to keep unexpected contention awareness. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_nocb.h | 32 ++++++-------------------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bae7925c497f..179f60ca0313 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -223,7 +223,6 @@ struct rcu_data { struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ - atomic_t nocb_lock_contended; /* Contention experienced. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 808c9a19fe1d..3ce30841119a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -91,8 +91,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0); /* * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the - * lock isn't immediately available, increment ->nocb_lock_contended to - * flag the contention. + * lock isn't immediately available, perform minimal sanity check. */ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) __acquires(&rdp->nocb_bypass_lock) @@ -100,29 +99,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) lockdep_assert_irqs_disabled(); if (raw_spin_trylock(&rdp->nocb_bypass_lock)) return; - atomic_inc(&rdp->nocb_lock_contended); + /* + * Contention expected only when local enqueue collide with + * remote flush from kthreads. + */ WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - smp_mb__after_atomic(); /* atomic_inc() before lock. */ raw_spin_lock(&rdp->nocb_bypass_lock); - smp_mb__before_atomic(); /* atomic_dec() after lock. */ - atomic_dec(&rdp->nocb_lock_contended); -} - -/* - * Spinwait until the specified rcu_data structure's ->nocb_lock is - * not contended. Please note that this is extremely special-purpose, - * relying on the fact that at most two kthreads and one CPU contend for - * this lock, and also that the two kthreads are guaranteed to have frequent - * grace-period-duration time intervals between successive acquisitions - * of the lock. This allows us to use an extremely simple throttling - * mechanism, and further to apply it only to the CPU doing floods of - * call_rcu() invocations. Don't try this at home! - */ -static void rcu_nocb_wait_contended(struct rcu_data *rdp) -{ - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) - cpu_relax(); } /* @@ -510,7 +492,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } // We need to use the bypass. - rcu_nocb_wait_contended(rdp); rcu_nocb_bypass_lock(rdp); ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ @@ -1631,12 +1612,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], - "cC"[!!atomic_read(&rdp->nocb_lock_contended)], "lL"[raw_spin_is_locked(&rdp->nocb_lock)], "sS"[!!rdp->nocb_cb_sleep], ".W"[swait_active(&rdp->nocb_cb_wq)], -- cgit From 9855c37edf0009cc276cecfee09f7e76e2380212 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:24:04 +0200 Subject: Revert "rcu-tasks: Fix synchronize_rcu_tasks() VS zap_pid_ns_processes()" This reverts commit 28319d6dc5e2ffefa452c2377dd0f71621b5bff0. The race it fixed was subject to conditions that don't exist anymore since: 1612160b9127 ("rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks") This latter commit removes the use of SRCU that used to cover the RCU-tasks blind spot on exit between the tasklist's removal and the final preemption disabling. The task is now placed instead into a temporary list inside which voluntary sleeps are accounted as RCU-tasks quiescent states. This would disarm the deadlock initially reported against PID namespace exit. Signed-off-by: Frederic Weisbecker Reviewed-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 -- kernel/pid_namespace.c | 17 ----------------- kernel/rcu/tasks.h | 16 +++------------- 3 files changed, 3 insertions(+), 32 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dfd2399f2cde..61cb3de236af 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -209,7 +209,6 @@ void synchronize_rcu_tasks_rude(void); #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); -void exit_tasks_rcu_stop(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) @@ -218,7 +217,6 @@ void exit_tasks_rcu_finish(void); #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } -static inline void exit_tasks_rcu_stop(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index dc48fecfa1dc..b566ae827cfc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -248,24 +248,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; - /* - * Release tasks_rcu_exit_srcu to avoid following deadlock: - * - * 1) TASK A unshare(CLONE_NEWPID) - * 2) TASK A fork() twice -> TASK B (child reaper for new ns) - * and TASK C - * 3) TASK B exits, kills TASK C, waits for TASK A to reap it - * 4) TASK A calls synchronize_rcu_tasks() - * -> synchronize_srcu(tasks_rcu_exit_srcu) - * 5) *DEADLOCK* - * - * It is considered safe to release tasks_rcu_exit_srcu here - * because we assume the current task can not be concurrently - * reaped at this point. - */ - exit_tasks_rcu_stop(); schedule(); - exit_tasks_rcu_start(); } __set_current_state(TASK_RUNNING); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e1bf33018e6d..4dc56b6e27c0 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -858,7 +858,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // not know to synchronize with this RCU Tasks grace period) have // completed exiting. The synchronize_rcu() in rcu_tasks_postgp() // will take care of any tasks stuck in the non-preemptible region -// of do_exit() following its call to exit_tasks_rcu_stop(). +// of do_exit() following its call to exit_tasks_rcu_finish(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -1220,7 +1220,7 @@ void exit_tasks_rcu_start(void) * Remove the task from the "yet another list" because do_exit() is now * non-preemptible, allowing synchronize_rcu() to wait beyond this point. */ -void exit_tasks_rcu_stop(void) +void exit_tasks_rcu_finish(void) { unsigned long flags; struct rcu_tasks_percpu *rtpcp; @@ -1231,22 +1231,12 @@ void exit_tasks_rcu_stop(void) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); list_del_init(&t->rcu_tasks_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); -} -/* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. - */ -void exit_tasks_rcu_finish(void) -{ - exit_tasks_rcu_stop(); - exit_tasks_rcu_finish_trace(current); + exit_tasks_rcu_finish_trace(t); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } -void exit_tasks_rcu_stop(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ -- cgit From 43b39cafbaf66cbbdace321c01701154a1c05fdc Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sun, 7 Apr 2024 19:27:14 +0800 Subject: rcutorture: Make rcutorture support srcu double call test This commit allows rcutorture to test double-call_srcu() when the CONFIG_DEBUG_OBJECTS_RCU_HEAD Kconfig option is enabled. The non-raw sdp structure's ->spinlock will be acquired in call_srcu(), hence this commit also removes the current IRQ and preemption disabling so as to avoid lockdep complaints. Link: https://lore.kernel.org/all/20240407112714.24460-1-qiang.zhang1211@gmail.com/ Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 807fbf6123a7..44cc455e1b61 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -390,6 +390,7 @@ struct rcu_torture_ops { int extendables; int slow_gps; int no_pi_lock; + int debug_objects; const char *name; }; @@ -577,6 +578,7 @@ static struct rcu_torture_ops rcu_ops = { .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, + .debug_objects = 1, .name = "rcu" }; @@ -747,6 +749,7 @@ static struct rcu_torture_ops srcu_ops = { .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, .name = "srcu" }; @@ -786,6 +789,7 @@ static struct rcu_torture_ops srcud_ops = { .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, .name = "srcud" }; @@ -3455,7 +3459,6 @@ rcu_torture_cleanup(void) cur_ops->gp_slow_unregister(NULL); } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static void rcu_torture_leak_cb(struct rcu_head *rhp) { } @@ -3473,7 +3476,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ /* * Verify that double-free causes debug-objects to complain, but only @@ -3482,39 +3484,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ static void rcu_test_debug_objects(void) { -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + int idx; + + if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) { + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n", + KBUILD_MODNAME, cur_ops->name); + return; + } + + if (WARN_ON_ONCE(cur_ops->debug_objects && + (!cur_ops->call || !cur_ops->cb_barrier))) + return; + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name); /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu_hurry(&rh2, rcu_torture_leak_cb); - call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */ + cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + cur_ops->call(&rh2, rcu_torture_leak_cb); + cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ if (rhp) { - call_rcu_hurry(rhp, rcu_torture_leak_cb); - call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + cur_ops->call(rhp, rcu_torture_leak_cb); + cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ } - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); + cur_ops->readunlock(idx); /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); + cur_ops->cb_barrier(); + pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); kfree(rhp); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } static void rcutorture_sync(void) -- cgit From 6040072f4774a575fa67b912efe7722874be337b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Apr 2024 12:02:11 -0700 Subject: rcutorture: Fix rcu_torture_fwd_cb_cr() data race On powerpc systems, spinlock acquisition does not order prior stores against later loads. This means that this statement: rfcp->rfc_next = NULL; Can be reordered to follow this statement: WRITE_ONCE(*rfcpp, rfcp); Which is then a data race with rcu_torture_fwd_prog_cr(), specifically, this statement: rfcpn = READ_ONCE(rfcp->rfc_next) KCSAN located this data race, which represents a real failure on powerpc. Signed-off-by: Paul E. McKenney Acked-by: Marco Elver Cc: Andrey Konovalov Cc: --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 44cc455e1b61..cafe047d046e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2630,7 +2630,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); rfcpp = rfp->rcu_fwd_cb_tail; rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; - WRITE_ONCE(*rfcpp, rfcp); + smp_store_release(rfcpp, rfcp); WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(rfp->n_launders_hist)) -- cgit From b9f147cdc2c0bf54ca2c25ed185806f1fc6da65f Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sat, 18 May 2024 16:28:08 -0700 Subject: rcutorture: Add missing MODULE_DESCRIPTION() macros Fix the following 'make W=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in kernel/rcu/rcutorture.o WARNING: modpost: missing MODULE_DESCRIPTION() in kernel/rcu/rcuscale.o WARNING: modpost: missing MODULE_DESCRIPTION() in kernel/rcu/refscale.o Signed-off-by: Jeff Johnson Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 1 + kernel/rcu/rcutorture.c | 1 + kernel/rcu/refscale.c | 1 + 3 files changed, 3 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 8db4fedaaa1e..b53a9e8f5904 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -42,6 +42,7 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cafe047d046e..08bf7c669dd3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,7 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 2c2648a3ad30..f4ea5b1ec068 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -63,6 +63,7 @@ do { \ #define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x) +MODULE_DESCRIPTION("Scalability test for object reference mechanisms"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) "); -- cgit From 0ac55d095d375e84fcdac5e51011613734e57854 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 May 2024 21:43:56 -0700 Subject: tools/rcu: Add rcu-updaters.sh script This commit adds a tools/rcu/rcu-updaters.sh script that uses bpftrace to print a histogram of the RCU update-side primitives invoked during the specified time interval, or until manually terminated if no interval is specified. Sample output on an idle laptop: @counts[poll_state_synchronize_rcu]: 6 @counts[synchronize_srcu]: 13 @counts[call_rcu_tasks_trace]: 25 @counts[synchronize_rcu]: 54 @counts[kvfree_call_rcu]: 428 @counts[call_rcu]: 2134 Note that when run on a kernel missing one or more of the symbols, this script will issue a diagnostic for each that is not found, but continue normally for the rest of the functions. Signed-off-by: Paul E. McKenney --- tools/rcu/rcu-updaters.sh | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100755 tools/rcu/rcu-updaters.sh diff --git a/tools/rcu/rcu-updaters.sh b/tools/rcu/rcu-updaters.sh new file mode 100755 index 000000000000..4ef1397927bb --- /dev/null +++ b/tools/rcu/rcu-updaters.sh @@ -0,0 +1,52 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Run bpftrace to obtain a histogram of the types of primitives used to +# initiate RCU grace periods. The count associated with rcu_gp_init() +# is the number of normal (non-expedited) grace periods. +# +# Usage: rcu-updaters.sh [ duration-in-seconds ] +# +# Note that not all kernel builds have all of these functions. In those +# that do not, this script will issue a diagnostic for each that is not +# found, but continue normally for the rest of the functions. + +duration=${1} +if test -n "${duration}" +then + exitclause='interval:s:'"${duration}"' { exit(); }' +else + echo 'Hit control-C to end sample and print results.' +fi +bpftrace -e 'kprobe:kvfree_call_rcu, + kprobe:call_rcu, + kprobe:call_rcu_tasks, + kprobe:call_rcu_tasks_rude, + kprobe:call_rcu_tasks_trace, + kprobe:call_srcu, + kprobe:rcu_barrier, + kprobe:rcu_barrier_tasks, + kprobe:rcu_barrier_tasks_rude, + kprobe:rcu_barrier_tasks_trace, + kprobe:srcu_barrier, + kprobe:synchronize_rcu, + kprobe:synchronize_rcu_expedited, + kprobe:synchronize_rcu_tasks, + kprobe:synchronize_rcu_tasks_rude, + kprobe:synchronize_rcu_tasks_trace, + kprobe:synchronize_srcu, + kprobe:synchronize_srcu_expedited, + kprobe:get_state_synchronize_rcu, + kprobe:get_state_synchronize_rcu_full, + kprobe:start_poll_synchronize_rcu, + kprobe:start_poll_synchronize_rcu_expedited, + kprobe:start_poll_synchronize_rcu_full, + kprobe:start_poll_synchronize_rcu_expedited_full, + kprobe:poll_state_synchronize_rcu, + kprobe:poll_state_synchronize_rcu_full, + kprobe:cond_synchronize_rcu, + kprobe:cond_synchronize_rcu_full, + kprobe:start_poll_synchronize_srcu, + kprobe:poll_state_synchronize_srcu, + kprobe:rcu_gp_init + { @counts[func] = count(); } '"${exitclause}" -- cgit From 399ced9594dfab51b782798efe60a2376cd5b724 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 May 2024 17:23:02 +0200 Subject: rcu/tasks: Fix stale task snaphot for Tasks Trace When RCU-TASKS-TRACE pre-gp takes a snapshot of the current task running on all online CPUs, no explicit ordering synchronizes properly with a context switch. This lack of ordering can permit the new task to miss pre-grace-period update-side accesses. The following diagram, courtesy of Paul, shows the possible bad scenario: CPU 0 CPU 1 ----- ----- // Pre-GP update side access WRITE_ONCE(*X, 1); smp_mb(); r0 = rq->curr; RCU_INIT_POINTER(rq->curr, TASK_B) spin_unlock(rq) rcu_read_lock_trace() r1 = X; /* ignore TASK_B */ Either r0==TASK_B or r1==1 is needed but neither is guaranteed. One possible solution to solve this is to wait for an RCU grace period at the beginning of the RCU-tasks-trace grace period before taking the current tasks snaphot. However this would introduce large additional latencies to RCU-tasks-trace grace periods. Another solution is to lock the target runqueue while taking the current task snapshot. This ensures that the update side sees the latest context switch and subsequent context switches will see the pre-grace-period update side accesses. This commit therefore adds runqueue locking to cpu_curr_snapshot(). Fixes: e386b6725798 ("rcu-tasks: Eliminate RCU Tasks Trace IPIs to online CPUs") Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 10 ++++++++++ kernel/sched/core.c | 14 +++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4dc56b6e27c0..ba3440a45b6d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1747,6 +1747,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // allow safe access to the hop list. for_each_online_cpu(cpu) { rcu_read_lock(); + // Note that cpu_curr_snapshot() picks up the target + // CPU's current task while its runqueue is locked with + // an smp_mb__after_spinlock(). This ensures that either + // the grace-period kthread will see that task's read-side + // critical section or the task will see the updater's pre-GP + // accesses. The trailing smp_mb() in cpu_curr_snapshot() + // does not currently play a role other than simplify + // that function's ordering semantics. If these simplified + // ordering semantics continue to be redundant, that smp_mb() + // might be removed. t = cpu_curr_snapshot(cpu); if (rcu_tasks_trace_pertask_prep(t, true)) trc_add_holdout(t, hop); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bcf2c4cc0522..05afa2932b5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4467,12 +4467,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) * @cpu: The CPU on which to snapshot the task. * * Returns the task_struct pointer of the task "currently" running on - * the specified CPU. If the same task is running on that CPU throughout, - * the return value will be a pointer to that task's task_struct structure. - * If the CPU did any context switches even vaguely concurrently with the - * execution of this function, the return value will be a pointer to the - * task_struct structure of a randomly chosen task that was running on - * that CPU somewhere around the time that this function was executing. + * the specified CPU. * * If the specified CPU was offline, the return value is whatever it * is, perhaps a pointer to the task_struct structure of that CPU's idle @@ -4486,11 +4481,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) */ struct task_struct *cpu_curr_snapshot(int cpu) { + struct rq *rq = cpu_rq(cpu); struct task_struct *t; + struct rq_flags rf; - smp_mb(); /* Pairing determined by caller's synchronization design. */ + rq_lock_irqsave(rq, &rf); + smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */ t = rcu_dereference(cpu_curr(cpu)); + rq_unlock_irqrestore(rq, &rf); smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; } -- cgit From 6f948568fdc66429c9f70e2cecde0664655cc870 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Mar 2024 14:54:57 -0400 Subject: rcu/tree: Reduce wake up for synchronize_rcu() common case In the synchronize_rcu() common case, we will have less than SR_MAX_USERS_WAKE_FROM_GP number of users per GP. Waking up the kworker is pointless just to free the last injected wait head since at that point, all the users have already been awakened. Introduce a new counter to track this and prevent the wakeup in the common case. [ paulmck: Remove atomic_dec_return_release in cannot-happen state. ] Signed-off-by: Joel Fernandes (Google) Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 ++++++++++++++++++++++++++---- kernel/rcu/tree.h | 1 + 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 28c7031711a3..ecd57940fc88 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = { .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, rcu_sr_normal_gp_cleanup_work), + .srs_cleanups_pending = ATOMIC_INIT(0), }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -1660,6 +1661,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) rcu_sr_put_wait_head(rcu); } + + /* Order list manipulations with atomic access. */ + atomic_dec_return_release(&rcu_state.srs_cleanups_pending); } /* @@ -1667,7 +1671,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) */ static void rcu_sr_normal_gp_cleanup(void) { - struct llist_node *wait_tail, *next, *rcu; + struct llist_node *wait_tail, *next = NULL, *rcu = NULL; int done = 0; wait_tail = rcu_state.srs_wait_tail; @@ -1693,16 +1697,34 @@ static void rcu_sr_normal_gp_cleanup(void) break; } - // concurrent sr_normal_gp_cleanup work might observe this update. - smp_store_release(&rcu_state.srs_done_tail, wait_tail); + /* + * Fast path, no more users to process except putting the second last + * wait head if no inflight-workers. If there are in-flight workers, + * they will remove the last wait head. + * + * Note that the ACQUIRE orders atomic access with list manipulation. + */ + if (wait_tail->next && wait_tail->next->next == NULL && + rcu_sr_is_wait_head(wait_tail->next) && + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) { + rcu_sr_put_wait_head(wait_tail->next); + wait_tail->next = NULL; + } + + /* Concurrent sr_normal_gp_cleanup work might observe this update. */ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + smp_store_release(&rcu_state.srs_done_tail, wait_tail); /* * We schedule a work in order to perform a final processing * of outstanding users(if still left) and releasing wait-heads * added by rcu_sr_normal_gp_init() call. */ - queue_work(sync_wq, &rcu_state.srs_cleanup_work); + if (wait_tail->next) { + atomic_inc(&rcu_state.srs_cleanups_pending); + if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) + atomic_dec(&rcu_state.srs_cleanups_pending); + } } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bae7925c497f..affcb92a358c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -420,6 +420,7 @@ struct rcu_state { struct llist_node *srs_done_tail; /* ready for GP users. */ struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; struct work_struct srs_cleanup_work; + atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ }; /* Values for rcu_state structure's gp_flags field. */ -- cgit From 51cace13729f8954e5414bea9dbfaf80ebd41a69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 May 2024 10:28:43 -0700 Subject: rcu: Disable interrupts directly in rcu_gp_init() Interrupts are enabled in rcu_gp_init(), so this commit switches from local_irq_save() and local_irq_restore() to local_irq_disable() and local_irq_enable(). Link: https://lore.kernel.org/all/febb13ab-a4bb-48b4-8e97-7e9f7749e6da@moroto.mountain/ Reported-by: Dan Carpenter Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ecd57940fc88..18eba61a6f8b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1832,7 +1832,7 @@ static noinline_for_stack bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { - local_irq_save(flags); + local_irq_disable(); arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1840,7 +1840,7 @@ static noinline_for_stack bool rcu_gp_init(void) /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); continue; } @@ -1877,7 +1877,7 @@ static noinline_for_stack bool rcu_gp_init(void) raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ -- cgit From 4b56b0f5d50c01ffb1372183f5c55e09764ca90e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 May 2024 16:47:16 -0700 Subject: srcu: Disable interrupts directly in srcu_gp_end() Interrupts are enabled in srcu_gp_end(), so this commit switches from spin_lock_irqsave_rcu_node() and spin_unlock_irqrestore_rcu_node() to spin_lock_irq_rcu_node() and spin_unlock_irq_rcu_node(). Link: https://lore.kernel.org/all/febb13ab-a4bb-48b4-8e97-7e9f7749e6da@moroto.mountain/ Reported-by: Dan Carpenter Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index bc4b58b0204e..d14d350f505f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -845,7 +845,6 @@ static void srcu_gp_end(struct srcu_struct *ssp) bool cbs; bool last_lvl; int cpu; - unsigned long flags; unsigned long gpseq; int idx; unsigned long mask; @@ -907,12 +906,12 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (!(gpseq & counter_wrap_check)) for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irq_rcu_node(sdp); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irq_rcu_node(sdp); } /* Callback initiation done, allow grace periods after next. */ -- cgit From 395e73bd8d35fa9b8505e44f63a2a10abde09cce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 9 Jun 2024 20:12:19 -0700 Subject: srcu: Add NUM_ACTIVE_SRCU_POLL_OLDSTATE This commit adds NUM_ACTIVE_SRCU_POLL_OLDSTATE, which gives the maximum number of distinct return values from get_state_synchronize_rcu() that can, at a given point in time, correspond to not-completed SRCU grace periods. Reported-by: Kent Overstreet Closes: https://lore.kernel.org/all/irycqy4sinjdgm2hkyix2bffunpcmuwgeufsx6nlljvqme3wiu@ify3zdnrmzph/ Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 236610e4a8fa..f664cba7a80c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -61,6 +61,10 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); +// Maximum number of unsigned long values corresponding to +// not-yet-completed SRCU grace periods. +#define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2 + #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); -- cgit From d7b0615cb8d24737df436c9c00788c0f35e02c7b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Jun 2024 08:14:08 -0700 Subject: srcu: Update cleanup_srcu_struct() comment Now that we have polled SRCU grace periods, a grace period can be started by start_poll_synchronize_srcu() as well as call_srcu(), synchronize_srcu(), and synchronize_srcu_expedited(). This commit therefore calls out this new start_poll_synchronize_srcu() possibility in the comment on the WARN_ON(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index bc4b58b0204e..15dc22a8ff5a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -667,7 +667,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); - return; /* Caller forgot to stop doing call_srcu()? */ + return; // Caller forgot to stop doing call_srcu()? + // Or caller invoked start_poll_synchronize_srcu() + // and then cleanup_srcu_struct() before that grace + // period ended? } kfree(sup->node); sup->node = NULL; -- cgit From e206f33e2c0774276a0497fe538472e12016a362 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Jun 2024 13:26:44 -0700 Subject: srcu: Fill out polled grace-period APIs This commit adds the get_completed_synchronize_srcu() and the same_state_synchronize_srcu() functions. The first returns a cookie that is always interpreted as corresponding to an expired grace period. The second does an equality comparison of a pair of cookies. Signed-off-by: Paul E. McKenney Cc: Kent Overstreet --- include/linux/srcu.h | 31 +++++++++++++++++++++++++++++++ kernel/rcu/srcutiny.c | 3 ++- kernel/rcu/srcutree.c | 3 ++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f664cba7a80c..6f6cb5fc1242 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -57,6 +57,20 @@ void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); void synchronize_srcu(struct srcu_struct *ssp); + +#define SRCU_GET_STATE_COMPLETED 0x1 + +/** + * get_completed_synchronize_srcu - Return a pre-completed polled state cookie + * + * Returns a value that poll_state_synchronize_srcu() will always treat + * as a cookie whose grace period has already completed. + */ +static inline unsigned long get_completed_synchronize_srcu(void) +{ + return SRCU_GET_STATE_COMPLETED; +} + unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); @@ -65,6 +79,23 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); // not-yet-completed SRCU grace periods. #define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2 +/** + * same_state_synchronize_srcu - Are two old-state values identical? + * @oldstate1: First old-state value. + * @oldstate2: Second old-state value. + * + * The two old-state values must have been obtained from either + * get_state_synchronize_srcu(), start_poll_synchronize_srcu(), or + * get_completed_synchronize_srcu(). Returns @true if the two values are + * identical and @false otherwise. This allows structures whose lifetimes + * are tracked by old-state values to push these values to a list header, + * allowing those structures to be slightly smaller. + */ +static inline bool same_state_synchronize_srcu(unsigned long oldstate1, unsigned long oldstate2) +{ + return oldstate1 == oldstate2; +} + #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5afd5cf494db..549c03336ee9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -277,7 +277,8 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) unsigned long cur_s = READ_ONCE(ssp->srcu_idx); barrier(); - return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); + return cookie == SRCU_GET_STATE_COMPLETED || + ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 15dc22a8ff5a..d6a4047719cb 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1543,7 +1543,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) + if (cookie != SRCU_GET_STATE_COMPLETED && + !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. -- cgit From 0a5e9bd31e128001939719aa507469ab7bd4f5ec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 Jun 2024 13:27:57 +0200 Subject: rcu: Remove full ordering on second EQS snapshot When the grace period kthread checks the extended quiescent state counter of a CPU, full ordering is necessary to ensure that either: * If the GP kthread observes the remote target in an extended quiescent state, then that target must observe all accesses prior to the current grace period, including the current grace period sequence number, once it exits that extended quiescent state. Also the GP kthread must observe all accesses performed by the target prior it entering in EQS. or: * If the GP kthread observes the remote target NOT in an extended quiescent state, then the target further entering in an extended quiescent state must observe all accesses prior to the current grace period, including the current grace period sequence number, once it enters that extended quiescent state. Also the GP kthread later observing that EQS must also observe all accesses performed by the target prior it entering in EQS. This ordering is explicitly performed both on the first EQS snapshot and on the second one as well through the combination of a preceding full barrier followed by an acquire read. However the second snapshot's full memory barrier is redundant and not needed to enforce the above guarantees: GP kthread Remote target ---- ----- // Access prior GP WRITE_ONCE(A, 1) // first snapshot smp_mb() x = smp_load_acquire(EQS) // Access prior GP WRITE_ONCE(B, 1) // EQS enter // implied full barrier by atomic_add_return() atomic_add_return(RCU_DYNTICKS_IDX, EQS) // implied full barrier by atomic_add_return() READ_ONCE(A) // second snapshot y = smp_load_acquire(EQS) z = READ_ONCE(B) If the GP kthread above fails to observe the remote target in EQS (x not in EQS), the remote target will observe A == 1 after further entering in EQS. Then the second snapshot taken by the GP kthread only need to be an acquire read in order to observe z == 1. Therefore remove the needless full memory barrier on second snapshot. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Boqun Feng Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 28c7031711a3..b7d943e98a9a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -321,7 +321,15 @@ static bool rcu_dynticks_in_eqs(int snap) */ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdp->cpu); + /* + * The first failing snapshot is already ordered against the accesses + * performed by the remote CPU after it exits idle. + * + * The second snapshot therefore only needs to order against accesses + * performed by the remote CPU prior to entering idle and therefore can + * rely solely on acquire semantics. + */ + return snap != ct_dynticks_cpu_acquire(rdp->cpu); } /* -- cgit From 9a7e73c9bedfecd00370403b5d35514b2d3aba3d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 Jun 2024 13:32:36 +0200 Subject: rcu: Remove superfluous full memory barrier upon first EQS snapshot When the grace period kthread checks the extended quiescent state counter of a CPU, full ordering is necessary to ensure that either: * If the GP kthread observes the remote target in an extended quiescent state, then that target must observe all accesses prior to the current grace period, including the current grace period sequence number, once it exits that extended quiescent state. or: * If the GP kthread observes the remote target NOT in an extended quiescent state, then the target further entering in an extended quiescent state must observe all accesses prior to the current grace period, including the current grace period sequence number, once it enters that extended quiescent state. This ordering is enforced through a full memory barrier placed right before taking the first EQS snapshot. However this is superfluous because the snapshot is taken while holding the target's rnp lock which provides the necessary ordering through its chain of smp_mb__after_unlock_lock(). Remove the needless explicit barrier before the snapshot and put a comment about the implicit barrier newly relied upon here. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Boqun Feng Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- .../RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 6 +++--- kernel/rcu/tree.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 5750f125361b..728b1e690c64 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -149,9 +149,9 @@ This case is handled by calls to the strongly ordered ``atomic_add_return()`` read-modify-write atomic operation that is invoked within ``rcu_dynticks_eqs_enter()`` at idle-entry time and within ``rcu_dynticks_eqs_exit()`` at idle-exit time. -The grace-period kthread invokes ``rcu_dynticks_snap()`` and -``rcu_dynticks_in_eqs_since()`` (both of which invoke -an ``atomic_add_return()`` of zero) to detect idle CPUs. +The grace-period kthread invokes first ``ct_dynticks_cpu_acquire()`` +(preceded by a full memory barrier) and ``rcu_dynticks_in_eqs_since()`` +(both of which rely on acquire semantics) to detect idle CPUs. +-----------------------------------------------------------------------+ | **Quick Quiz**: | diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b7d943e98a9a..da5718da280d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -777,7 +777,18 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu); + /* + * Full ordering between remote CPU's post idle accesses and updater's + * accesses prior to current GP (and also the started GP sequence number) + * is enforced by rcu_seq_start() implicit barrier and even further by + * smp_mb__after_unlock_lock() barriers chained all the way throughout the + * rnp locking tree since rcu_gp_init() and up to the current leaf rnp + * locking. + * + * Ordering between remote CPU's pre idle accesses and post grace period + * updater's accesses is enforced by the below acquire semantic. + */ + rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); -- cgit From 33c0860bf7e7ace39eba96f51cc6d898ab253202 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 Jun 2024 13:36:58 +0200 Subject: rcu/exp: Remove superfluous full memory barrier upon first EQS snapshot When the grace period kthread checks the extended quiescent state counter of a CPU, full ordering is necessary to ensure that either: * If the GP kthread observes the remote target in an extended quiescent state, then that target must observe all accesses prior to the current grace period, including the current grace period sequence number, once it exits that extended quiescent state. or: * If the GP kthread observes the remote target NOT in an extended quiescent state, then the target further entering in an extended quiescent state must observe all accesses prior to the current grace period, including the current grace period sequence number, once it enters that extended quiescent state. This ordering is enforced through a full memory barrier placed right before taking the first EQS snapshot. However this is superfluous because the snapshot is taken while holding the target's rnp lock which provides the necessary ordering through its chain of smp_mb__after_unlock_lock(). Remove the needless explicit barrier before the snapshot and put a comment about the implicit barrier newly relied upon here. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Boqun Feng Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8a1d9c8bd9f7..1dbad2442e8d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -357,7 +357,21 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(cpu); + /* + * Full ordering between remote CPU's post idle accesses + * and updater's accesses prior to current GP (and also + * the started GP sequence number) is enforced by + * rcu_seq_start() implicit barrier, relayed by kworkers + * locking and even further by smp_mb__after_unlock_lock() + * barriers chained all the way throughout the rnp locking + * tree since sync_exp_reset_tree() and up to the current + * leaf rnp locking. + * + * Ordering between remote CPU's pre idle accesses and + * post grace period updater's accesses is enforced by the + * below acquire semantic. + */ + snap = ct_dynticks_cpu_acquire(cpu); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else -- cgit From e7a3c8ea6e2509f71150fa13b00da3ef2bbe2387 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 May 2024 14:53:30 +0200 Subject: rcu: Remove full memory barrier on boot time eqs sanity check When the boot CPU initializes the per-CPU data on behalf of all possible CPUs, a sanity check is performed on each of them to make sure none is initialized in an extended quiescent state. This check involves a full memory barrier which is useless at this early boot stage. Do a plain access instead. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Boqun Feng Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da5718da280d..57d813781749 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4786,7 +4786,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(ct->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); + WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; -- cgit From 55911a9f4287c19bf7ef29aeace14044a6ed88cb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 May 2024 14:53:31 +0200 Subject: rcu: Remove full memory barrier on RCU stall printout RCU stall printout fetches the EQS state of a CPU with a preceding full memory barrier. However there is nothing to order this read against at this debugging stage. It is inherently racy when performed remotely. Do a plain read instead. This was the last user of rcu_dynticks_snap(). Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Boqun Feng Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 10 ---------- kernel/rcu/tree_stall.h | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 57d813781749..100094717bda 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -295,16 +295,6 @@ static void rcu_dynticks_eqs_online(void) ct_state_inc(RCU_DYNTICKS_IDX); } -/* - * Snapshot the ->dynticks counter with full ordering so as to allow - * stable comparison of this counter with past and future snapshots. - */ -static int rcu_dynticks_snap(int cpu) -{ - smp_mb(); // Fundamental RCU ordering guarantee. - return ct_dynticks_cpu_acquire(cpu); -} - /* * Return true if the snapshot returned from rcu_dynticks_snap() * indicates that RCU is in an extended quiescent state. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 460efecd077b..4b0e9d7c4c68 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); + rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) // Print signed value, as negative values indicate a probable bug. @@ -515,7 +515,7 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(cpu) & 0xffff, + ct_dynticks_cpu(cpu) & 0xffff, ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, -- cgit From 677ab23bdf416ec8f3ecaf10d7cc8d0ccb46adab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 May 2024 14:53:32 +0200 Subject: rcu/exp: Remove redundant full memory barrier at the end of GP A full memory barrier is necessary at the end of the expedited grace period to order: 1) The grace period completion (pictured by the GP sequence number) with all preceding accesses. This pairs with rcu_seq_end() performed by the concurrent kworker. 2) The grace period completion and subsequent post-GP update side accesses. Pairs again against rcu_seq_end(). This full barrier is already provided by the final sync_exp_work_done() test, making the subsequent explicit one redundant. Remove it and improve comments. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Boqun Feng Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_exp.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1dbad2442e8d..4acd29d16fdb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -265,7 +265,12 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - smp_mb(); /* Ensure test happens before caller kfree(). */ + /* + * Order GP completion with preceding accesses. Order also GP + * completion with post GP update side accesses. Pairs with + * rcu_seq_end(). + */ + smp_mb(); return true; } return false; @@ -967,7 +972,6 @@ void synchronize_rcu_expedited(void) rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); -- cgit From 68d124b0999919015e6d23008eafea106ec6bb40 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 May 2024 20:11:58 -0700 Subject: rcu: Add rcutree.nohz_full_patience_delay to reduce nohz_full OS jitter If a CPU is running either a userspace application or a guest OS in nohz_full mode, it is possible for a system call to occur just as an RCU grace period is starting. If that CPU also has the scheduling-clock tick enabled for any reason (such as a second runnable task), and if the system was booted with rcutree.use_softirq=0, then RCU can add insult to injury by awakening that CPU's rcuc kthread, resulting in yet another task and yet more OS jitter due to switching to that task, running it, and switching back. In addition, in the common case where that system call is not of excessively long duration, awakening the rcuc task is pointless. This pointlessness is due to the fact that the CPU will enter an extended quiescent state upon returning to the userspace application or guest OS. In this case, the rcuc kthread cannot do anything that the main RCU grace-period kthread cannot do on its behalf, at least if it is given a few additional milliseconds (for example, given the time duration specified by rcutree.jiffies_till_first_fqs, give or take scheduling delays). This commit therefore adds a rcutree.nohz_full_patience_delay kernel boot parameter that specifies the grace period age (in milliseconds, rounded to jiffies) before which RCU will refrain from awakening the rcuc kthread. Preliminary experimentation suggests a value of 1000, that is, one second. Increasing rcutree.nohz_full_patience_delay will increase grace-period latency and in turn increase memory footprint, so systems with constrained memory might choose a smaller value. Systems with less-aggressive OS-jitter requirements might choose the default value of zero, which keeps the traditional immediate-wakeup behavior, thus avoiding increases in grace-period latency. [ paulmck: Apply Leonardo Bras feedback. ] Link: https://lore.kernel.org/all/20240328171949.743211-1-leobras@redhat.com/ Reported-by: Leonardo Bras Suggested-by: Leonardo Bras Suggested-by: Sean Christopherson Signed-off-by: Paul E. McKenney Reviewed-by: Leonardo Bras --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ kernel/rcu/tree.c | 11 +++++++++-- kernel/rcu/tree_plugin.h | 10 ++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 500cfa776225..79f2020b0a62 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5018,6 +5018,14 @@ the ->nocb_bypass queue. The definition of "too many" is supplied by this kernel boot parameter. + rcutree.nohz_full_patience_delay= [KNL] + On callback-offloaded (rcu_nocbs) CPUs, avoid + disturbing RCU unless the grace period has + reached the specified age in milliseconds. + Defaults to zero. Large values will be capped + at five seconds. All values will be rounded down + to the nearest value representable by jiffies. + rcutree.qhimark= [KNL] Set threshold of queued RCU callbacks beyond which batch limiting is disabled. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 18eba61a6f8b..e942dcf51008 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -176,6 +176,9 @@ static int gp_init_delay; module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +static int nohz_full_patience_delay; +module_param(nohz_full_patience_delay, int, 0444); +static int nohz_full_patience_delay_jiffies; // Add delay to rcu_read_unlock() for strict grace periods. static int rcu_unlock_delay; @@ -4335,11 +4338,15 @@ static int rcu_pending(int user) return 1; /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ - if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu()) + gp_in_progress = rcu_gp_in_progress(); + if ((user || rcu_is_cpu_rrupt_from_idle() || + (gp_in_progress && + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + + nohz_full_patience_delay_jiffies))) && + rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - gp_in_progress = rcu_gp_in_progress(); if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) return 1; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 340bbefe5f65..009ce0320f7f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -93,6 +93,16 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); + if (nohz_full_patience_delay < 0) { + pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay); + nohz_full_patience_delay = 0; + } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) { + pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC); + nohz_full_patience_delay = 5 * MSEC_PER_SEC; + } else if (nohz_full_patience_delay) { + pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay); + } + nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) -- cgit From 7f09e70f9eae95b12488b931ee249f36e35df542 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 May 2024 17:03:06 -0700 Subject: MAINTAINERS: Add Uladzislau Rezki as RCU maintainer Signed-off-by: Paul E. McKenney Cc: Uladzislau Rezki --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d6c90161c7bf..3f2047082073 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18868,6 +18868,7 @@ M: Neeraj Upadhyay (kernel/rcu/tasks.h) M: Joel Fernandes M: Josh Triplett M: Boqun Feng +M: Uladzislau Rezki R: Steven Rostedt R: Mathieu Desnoyers R: Lai Jiangshan -- cgit From 6f4cec22c38a33c1981e8f39cdc698119903f1cb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 12 May 2024 08:02:07 -0700 Subject: rcu: Eliminate lockless accesses to rcu_sync->gp_count The rcu_sync structure's ->gp_count field is always accessed under the protection of that same structure's ->rss_lock field, with the exception of a pair of WARN_ON_ONCE() calls just prior to acquiring that lock in functions rcu_sync_exit() and rcu_sync_dtor(). These lockless accesses are unnecessary and impair KCSAN's ability to catch bugs that might be inserted via other lockless accesses. This commit therefore moves those WARN_ON_ONCE() calls under the lock. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- kernel/rcu/sync.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 6c2bd9001adc..da60a9947c00 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) * we are called at early boot time but this shouldn't happen. */ } - WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1); + rsp->gp_count++; spin_unlock_irq(&rsp->rss_lock); if (gp_state == GP_IDLE) { @@ -151,15 +151,11 @@ void rcu_sync_enter(struct rcu_sync *rsp) */ void rcu_sync_exit(struct rcu_sync *rsp) { - int gpc; - WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); - WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); spin_lock_irq(&rsp->rss_lock); - gpc = rsp->gp_count - 1; - WRITE_ONCE(rsp->gp_count, gpc); - if (!gpc) { + WARN_ON_ONCE(rsp->gp_count == 0); + if (!--rsp->gp_count) { if (rsp->gp_state == GP_PASSED) { WRITE_ONCE(rsp->gp_state, GP_EXIT); rcu_sync_call(rsp); @@ -178,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp) { int gp_state; - WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); + WARN_ON_ONCE(rsp->gp_count); if (rsp->gp_state == GP_REPLAY) WRITE_ONCE(rsp->gp_state, GP_EXIT); gp_state = rsp->gp_state; -- cgit From 55d4669ef1b76823083caecfab12a8bd2ccdcf64 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 24 May 2024 16:05:24 +0200 Subject: rcu: Fix rcu_barrier() VS post CPUHP_TEARDOWN_CPU invocation When rcu_barrier() calls rcu_rdp_cpu_online() and observes a CPU off rnp->qsmaskinitnext, it means that all accesses from the offline CPU preceding the CPUHP_TEARDOWN_CPU are visible to RCU barrier, including callbacks expiration and counter updates. However interrupts can still fire after stop_machine() re-enables interrupts and before rcutree_report_cpu_dead(). The related accesses happening between CPUHP_TEARDOWN_CPU and rnp->qsmaskinitnext clearing are _NOT_ guaranteed to be seen by rcu_barrier() without proper ordering, especially when callbacks are invoked there to the end, making rcutree_migrate_callback() bypass barrier_lock. The following theoretical race example can make rcu_barrier() hang: CPU 0 CPU 1 ----- ----- //cpu_down() smpboot_park_threads() //ksoftirqd is parked now rcu_sched_clock_irq() invoke_rcu_core() do_softirq() rcu_core() rcu_do_batch() // callback storm // rcu_do_batch() returns // before completing all // of them // do_softirq also returns early because of // timeout. It defers to ksoftirqd but // it's parked stop_machine() take_cpu_down() rcu_barrier() spin_lock(barrier_lock) // observes rcu_segcblist_n_cbs(&rdp->cblist) != 0 do_softirq() rcu_core() rcu_do_batch() //completes all pending callbacks //smp_mb() implied _after_ callback number dec rcutree_report_cpu_dead() rnp->qsmaskinitnext &= ~rdp->grpmask; rcutree_migrate_callback() // no callback, early return without locking // barrier_lock //observes !rcu_rdp_cpu_online(rdp) rcu_barrier_entrain() rcu_segcblist_entrain() // Observe rcu_segcblist_n_cbs(rsclp) == 0 // because no barrier between reading // rnp->qsmaskinitnext and rsclp->len rcu_segcblist_add_len() smp_mb__before_atomic() // will now observe the 0 count and empty // list, but too late, we enqueue regardless WRITE_ONCE(rsclp->len, rsclp->len + v); // ignored barrier callback // rcu barrier stall... This could be solved with a read memory barrier, enforcing the message passing between rnp->qsmaskinitnext and rsclp->len, matching the full memory barrier after rsclp->len addition in rcu_segcblist_add_len() performed at the end of rcu_do_batch(). However the rcu_barrier() is complicated enough and probably doesn't need too many more subtleties. CPU down is a slowpath and the barrier_lock seldom contended. Solve the issue with unconditionally locking the barrier_lock on rcutree_migrate_callbacks(). This makes sure that either rcu_barrier() sees the empty queue or its entrained callback will be migrated. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e942dcf51008..58788a08a88a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -5139,11 +5139,15 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); bool needwake; - if (rcu_rdp_is_offloaded(rdp) || - rcu_segcblist_empty(&rdp->cblist)) - return; /* No callbacks to migrate. */ + if (rcu_rdp_is_offloaded(rdp)) + return; raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (rcu_segcblist_empty(&rdp->cblist)) { + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + return; /* No callbacks to migrate. */ + } + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); -- cgit